Skip to content

Instantly share code, notes, and snippets.

@anasAlsalol
Last active June 6, 2020 07:47
Show Gist options
  • Select an option

  • Save anasAlsalol/fd1056dc686f8ad25f1d313042106189 to your computer and use it in GitHub Desktop.

Select an option

Save anasAlsalol/fd1056dc686f8ad25f1d313042106189 to your computer and use it in GitHub Desktop.
FirstDSArabicClassification.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Copy of FirstDSArabicClassification.ipynb",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/anasAlsalol/fd1056dc686f8ad25f1d313042106189/copy-of-firstdsarabicclassification.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "G82daw0gkO3g",
"colab_type": "code",
"outputId": "d82dcc41-9e13-450e-bf77-ec201515a670",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 459
}
},
"source": [
"# Download the Arabic news categorization dataset (Kaggle, link below), unzip it and preview the CSV.\n",
"# https://www.kaggle.com/muhammedfathi/mutli-class-arabic-news-classifictation-with-dl/data\n",
"!wget 'https://storage.googleapis.com/kaggle-data-sets/422485%2F806087%2Fcompressed%2Farabic_categorization_data.csv.zip?GoogleAccessId=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com&Expires=1591214331&Signature=V1dMO9XvxJGQ8FbumHfUOGPlsVAYmgaRGiPPXT3Xck8LLteUQLtLdIOL7CZTRvtFqCWGrCmgimhQJW7EYuFjq%2FdT1RaqN4g%2FnNgGxSbe3cbkwY2dcL%2FhlgWaMYLmh2a%2FbNUT1JYk1qezDddwiD1O3QRyzBZooPeZ84gkwyH5N3JbFhlJEfYdVlxgUp6dlyqQXhS5LymT09YfYK8KcasIhng2g08UCJY2l%2F5%2FOWZqSfMELTTZv1o2DMJutbw48PXSRFa7%2FuhQMeHRu2Eg328DcUkwJtzwgpJl5GAstIg2rUN%2FerBWl9m7HJ%2B2YEU5RkmyHlsUEeutmVErAJGM1%2BJJEw%3D%3D' -O arabic_dataset_classifiction.csv.zip\n",
"!unzip arabic_dataset_classifiction.csv.zip \n",
"!head arabic_categorization_data.csv"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"--2020-05-31 19:59:36-- https://storage.googleapis.com/kaggle-data-sets/422485%2F806087%2Fcompressed%2Farabic_categorization_data.csv.zip?GoogleAccessId=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com&Expires=1591214331&Signature=V1dMO9XvxJGQ8FbumHfUOGPlsVAYmgaRGiPPXT3Xck8LLteUQLtLdIOL7CZTRvtFqCWGrCmgimhQJW7EYuFjq%2FdT1RaqN4g%2FnNgGxSbe3cbkwY2dcL%2FhlgWaMYLmh2a%2FbNUT1JYk1qezDddwiD1O3QRyzBZooPeZ84gkwyH5N3JbFhlJEfYdVlxgUp6dlyqQXhS5LymT09YfYK8KcasIhng2g08UCJY2l%2F5%2FOWZqSfMELTTZv1o2DMJutbw48PXSRFa7%2FuhQMeHRu2Eg328DcUkwJtzwgpJl5GAstIg2rUN%2FerBWl9m7HJ%2B2YEU5RkmyHlsUEeutmVErAJGM1%2BJJEw%3D%3D\n",
"Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.189.128, 2404:6800:4008:c02::80\n",
"Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.189.128|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 3764777 (3.6M) [application/zip]\n",
"Saving to: ‘arabic_dataset_classifiction.csv.zip’\n",
"\n",
"arabic_dataset_clas 100%[===================>] 3.59M --.-KB/s in 0.02s \n",
"\n",
"2020-05-31 19:59:37 (201 MB/s) - ‘arabic_dataset_classifiction.csv.zip’ saved [3764777/3764777]\n",
"\n",
"Archive: arabic_dataset_classifiction.csv.zip\n",
" inflating: arabic_categorization_data.csv \n",
",text,type\n",
"0,\"\n",
"أشرف رئيس الجمهورية الباجي قايد السبسي اليوم بقصر قرطاج على موكب منح الوسام الوطني للاستحقاق الثقافي لثلّة من الفنانين والمبدعين و ذلك بمناسبة انعقاد أيام قرطاج السينمائية.\n",
"والفنانون هم : *الصنف الأوّل: - عبد الرحمان سيساكو (موريتانيا) - جميل راتب (مصر) - ميشال خليفي (فلسطين) - ادريسا ودراغو (بوركينا فاسو) - محمد ملص (سوريا) - رضا الباهي (تونس: تعذّر عليه الحضور) - عمر الخليفي (تونس) *الصنف الثاني: - عبد العزيز بن ملوكة (تونس) - نجيب عيّاد (تونس) - منصف شرف الدين (تونس: تعذّر عليه الحضور) *الصنف الثالث: - ابراهيم اللطيّف (تونس) - خميس الخياطي (تونس: تعذّر عليه الحضور) - درّة زرّوق (تونس: تعذّر عليها الحضور) - شوقي الماجري (تونس) *الصنف الرابع: - كوثر بن هنيّة (تونس)\n",
"\",culture\n",
"1,\"\n",
"تحصل كتاب \"\"المصحف وقراءاته\"\" الذي ألفه باحثون تونسيون متخصصون، على جائزة عربية على هامش افتتاح معرض بيروت العربي الدولي للكتاب الذي افتتح امس ويتواصل إلى يوم 14 ديسمبر الحالي.\n",
"وأسند النادي الثقافي العربي الجائزة الثالثة لـ: \"\"المصحف وقراءاته\"\"، الذي جاء في خمسة مجلدات وتم تصنيفه من طرف مجموعة من الباحثين بإشراف الباحث التونسي في الحضارة الاسلامية عبد المجيد الشرقي ونشره من قبل مؤسسة \"\"مؤمنون بلا حدود للدراسات والنشر\"\". ع ب م \n",
"\",culture\n",
"2,\"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "A2mBja1RkoSw",
"colab_type": "code",
"outputId": "fcccbe10-e199-46d0-ff78-9627c6e10804",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
}
},
"source": [
"import nltk\n",
"nltk.download('stopwords')\n",
"import seaborn as sns\n",
"%matplotlib inline\n",
"import pandas.util.testing as tm"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "qlI2DqOvk9i9",
"colab_type": "code",
"colab": {}
},
"source": [
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
"from sklearn.model_selection import train_test_split, KFold\n",
"import re\n",
"from string import punctuation\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem.snowball import SnowballStemmer\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.neural_network import MLPClassifier \n",
"from sklearn.tree import DecisionTreeClassifier \n",
"from sklearn.ensemble import BaggingClassifier\n",
"from sklearn.ensemble import VotingClassifier\n",
"from sklearn.naive_bayes import GaussianNB \n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.naive_bayes import MultinomialNB, BernoulliNB\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"\n",
"# metric\n",
"from sklearn.metrics import classification_report,confusion_matrix , accuracy_score\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# Arabic Snowball stemmer used by stemming().\n",
"stemmer = SnowballStemmer(\"arabic\")\n",
"# Tokens to discard: NLTK's Arabic stop-word list plus each character of string.punctuation.\n",
"_stopwords = set(stopwords.words('arabic')).union(punctuation)\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "mDc-9KhllC49",
"colab_type": "code",
"colab": {}
},
"source": [
"def stopwords(text):\n",
"    '''Lowercase the whitespace-separated tokens of `text` and drop those listed in `_stopwords`.\n",
"\n",
"    Returns the remaining tokens joined by single spaces.\n",
"    NOTE: this function reuses the name of the `stopwords` corpus imported from nltk.corpus.\n",
"    '''\n",
"    lowered_tokens = (token.lower() for token in text.split())\n",
"    kept_tokens = [token for token in lowered_tokens if token not in _stopwords]\n",
"    return \" \".join(kept_tokens)\n",
"\n",
"\n",
"def _processNews(news):\n",
"    '''Reduce a news text to digits 0-9, characters in the listed Arabic Unicode ranges, and spaces.\n",
"\n",
"    Steps, in order: lower-case the text, replace URLs with the placeholder 'URL',\n",
"    strip the leading '#' from hashtags, blank out Latin letters (which also blanks\n",
"    the 'URL' placeholder), then replace every remaining character outside the\n",
"    kept ranges with a space.\n",
"    Returns the cleaned string; runs of spaces are not collapsed.\n",
"    '''\n",
"    news = news.lower()\n",
"    # Raw strings: the backslash sequences below are regex escapes, not Python string escapes.\n",
"    news = re.sub(r'((www\\.[^\\s]+)|(https?://[^\\s]+))', 'URL', news)\n",
"    news = re.sub(r'#([^\\s]+)', r'\\1', news)  # keep the hashtag text, drop the '#'\n",
"    news = re.sub(r'[a-zA-Z]', ' ', news)\n",
"    # Everything that is not a digit or inside one of these ranges becomes a space.\n",
"    news = re.sub(r'[^0-9\\u0600-\\u06ff\\u0750-\\u077f\\ufb50-\\ufbc1\\ufbd3-\\ufd3f\\ufd50-\\ufd8f\\ufe70-\\ufefc\\uFDF0-\\uFDFD]', ' ', news)\n",
"    return news\n",
"\n",
"\n",
"def stemming(text):\n",
"    '''Stem each whitespace-separated word of `text` with the module-level `stemmer`.\n",
"\n",
"    Returns the stemmed words joined by single spaces. If splitting or stemming\n",
"    raises, `text` is returned unchanged.\n",
"    '''\n",
"    try:\n",
"        stemmed_words = [stemmer.stem(word) for word in text.split()]\n",
"    except Exception:\n",
"        # Best-effort fallback: hand back the input as-is. Joining the raw string\n",
"        # instead would insert a space between every character.\n",
"        return text\n",
"    return \" \".join(stemmed_words)\n",
"\n",
"def evaluate_classification(X, y, test_size=0.2, random_state=0):\n",
"    '''Fit a fixed set of scikit-learn classifiers on one train/test split and score each.\n",
"\n",
"    Parameters:\n",
"        X: feature matrix passed to train_test_split and the estimators.\n",
"        y: class labels aligned with the rows of X.\n",
"        test_size: share of the samples held out for testing (default 0.2).\n",
"        random_state: seed for the split and for every estimator that takes one (default 0).\n",
"\n",
"    Prints accuracy, confusion matrix and classification report per model.\n",
"    Returns a DataFrame indexed by model name with one 'accuracy' column.\n",
"    '''\n",
"    X_train, X_test, y_train, y_test = train_test_split(\n",
"        X, y, test_size=test_size, random_state=random_state)\n",
"\n",
"    # Estimators that are reused or need more than a one-line constructor.\n",
"    scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear', random_state=random_state,\n",
"                                        C=5, penalty='l2', max_iter=1000)\n",
"    # NOTE(review): `base_estimator` was renamed `estimator` in scikit-learn 1.2 - adjust when upgrading.\n",
"    adaboost = AdaBoostClassifier(base_estimator=LinearSVC(random_state=random_state), n_estimators=10,\n",
"                                  learning_rate=0.01, algorithm='SAMME', random_state=random_state)\n",
"    voting = VotingClassifier(\n",
"        estimators=[('log_reg', scikit_log_reg),\n",
"                    ('sgd', SGDClassifier(random_state=random_state)),\n",
"                    ('linear_svc', LinearSVC(random_state=random_state)),\n",
"                    ('mnb', MultinomialNB()),\n",
"                    ('bnb', BernoulliNB())],\n",
"        voting='hard', n_jobs=-1)\n",
"\n",
"    # Each report name sits next to its model so the two cannot drift apart.\n",
"    models = [\n",
"        ('LinearSVC', LinearSVC(random_state=random_state)),\n",
"        ('MultinomialNB', MultinomialNB()),\n",
"        ('BernoulliNB', BernoulliNB()),\n",
"        ('SGDClassifier', SGDClassifier(random_state=random_state)),\n",
"        ('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=5, random_state=random_state)),\n",
"        ('RandomForestClassifier', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,\n",
"                                                          random_state=random_state)),\n",
"        ('AdaBoostClassifier', adaboost),\n",
"        ('KNeighborsClassifier', KNeighborsClassifier(3, n_jobs=-1)),\n",
"        ('ensemble-Learning', voting),\n",
"        ('scikit_log_reg', scikit_log_reg),\n",
"    ]\n",
"\n",
"    # Dataframe for results\n",
"    results = pd.DataFrame(columns=['accuracy'], index=[name for name, _ in models])\n",
"\n",
"    # Train and predict with each model\n",
"    for model_name, model in models:\n",
"        model.fit(X_train, y_train)\n",
"        predictions = model.predict(X_test)\n",
"        acc = accuracy_score(y_test, predictions)\n",
"\n",
"        print('Result For ', model_name)\n",
"        print(\"Accuracy:\", acc)\n",
"        print(\"Confusion Matrix:\\n\", confusion_matrix(y_test, predictions))\n",
"        print(\"Classification Report:\\n\", classification_report(y_test, predictions))\n",
"\n",
"        results.loc[model_name, 'accuracy'] = acc\n",
"    return results\n",
"\n",
"\n",
"def single_calssifier(X, y):\n",
"    '''Fit one AdaBoost model (LinearSVC base learner) on an 80/20 split and score it.\n",
"\n",
"    Prints accuracy, confusion matrix and classification report.\n",
"    Returns a one-row DataFrame (index 'single', column 'accuracy').\n",
"    '''\n",
"    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
"\n",
"    # NOTE(review): `base_estimator` was renamed `estimator` in scikit-learn 1.2 - adjust when upgrading.\n",
"    adaboost = AdaBoostClassifier(base_estimator=LinearSVC(random_state=0), n_estimators=10,\n",
"                                  learning_rate=0.1, algorithm='SAMME', random_state=0)\n",
"    model_name = 'single'\n",
"\n",
"    adaboost.fit(X_train, y_train)\n",
"    predictions = adaboost.predict(X_test)\n",
"    acc = accuracy_score(y_test, predictions)\n",
"\n",
"    print('Result For ', model_name)\n",
"    print(\"Accuracy:\", acc)\n",
"    print(\"Confusion Matrix:\\n\", confusion_matrix(y_test, predictions))\n",
"    print(\"Classification Report:\\n\", classification_report(y_test, predictions))\n",
"\n",
"    # Dataframe for results\n",
"    results = pd.DataFrame(columns=['accuracy'], index=[model_name])\n",
"    results.loc[model_name, 'accuracy'] = acc\n",
"    return results"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "uv0Snu0-lHyR",
"colab_type": "code",
"outputId": "b877c928-ab89-41b5-eeb1-593e16653ca7",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
}
},
"source": [
"# Load the unzipped CSV and expose the category column under the name 'label'.\n",
"trainingData = (\n",
"    pd.read_csv(\"arabic_categorization_data.csv\")\n",
"    .rename(columns={'type': 'label'})\n",
")\n",
"# The CSV's leading unnamed column only repeats the row number; discard it.\n",
"trainingData = trainingData.drop(columns=trainingData.columns[0])\n",
"trainingData.head()"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>\\nأشرف رئيس الجمهورية الباجي قايد السبسي اليوم...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>\\nتحصل كتاب \"المصحف وقراءاته\" الذي ألفه باحثون...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>\\nاستنكرت إدارة المسرح الوطني التونسي الحملة ا...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>\\nاحتضن جناح تونس في القرية الدولية للأفلام بم...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>\\nشهدت برلين أمس الجمعة افتتاح مسجد فريد من نو...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 \\nأشرف رئيس الجمهورية الباجي قايد السبسي اليوم... culture\n",
"1 \\nتحصل كتاب \"المصحف وقراءاته\" الذي ألفه باحثون... culture\n",
"2 \\nاستنكرت إدارة المسرح الوطني التونسي الحملة ا... culture\n",
"3 \\nاحتضن جناح تونس في القرية الدولية للأفلام بم... culture\n",
"4 \\nشهدت برلين أمس الجمعة افتتاح مسجد فريد من نو... culture"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Ha3hNMTnlMwf",
"colab_type": "code",
"colab": {}
},
"source": [
"# Keep only the rows whose 'text' value is present.\n",
"has_text = trainingData['text'].notnull()\n",
"trainingData = trainingData[has_text]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "XZ8pWmhFlP-1",
"colab_type": "code",
"colab": {}
},
"source": [
"# Stop-word removal runs first, then the character-level cleaning.\n",
"trainingData['text'] = (\n",
"    trainingData['text']\n",
"    .apply(stopwords)\n",
"    .apply(_processNews)\n",
")\n",
"# Stemming is currently switched off: trainingData['text'] = trainingData['text'].apply(stemming)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Mk9FHcCrlTLt",
"colab_type": "code",
"outputId": "b2a4c12f-6307-4f8e-ad32-4d9db64af89f",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
}
},
"source": [
"trainingData.head(10)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>أشرف رئيس الجمهورية الباجي قايد السبسي اليوم ب...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>تحصل كتاب المصحف وقراءاته ألفه باحثون تونسيو...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>استنكرت إدارة المسرح الوطني التونسي الحملة شنه...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>احتضن جناح تونس القرية الدولية للأفلام بمدينة ...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>شهدت برلين أمس الجمعة افتتاح مسجد فريد نوعه ال...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>نعت وزارة الشّؤون الثّقافيّة المنشد الصّوفي عز...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>أعلنت وزارة الشؤون الثقافية بلاغ اليوم الاثنين...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>استضاف برنامج سينما سينما الأربعاء 18 جانفي ...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>تنطلق فعاليات التظاهرة الموسيقية الالكترونية ص...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>ينطلق مهرجان القيروان للشعر العربي ببيت الشعر ...</td>\n",
" <td>culture</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 أشرف رئيس الجمهورية الباجي قايد السبسي اليوم ب... culture\n",
"1 تحصل كتاب المصحف وقراءاته ألفه باحثون تونسيو... culture\n",
"2 استنكرت إدارة المسرح الوطني التونسي الحملة شنه... culture\n",
"3 احتضن جناح تونس القرية الدولية للأفلام بمدينة ... culture\n",
"4 شهدت برلين أمس الجمعة افتتاح مسجد فريد نوعه ال... culture\n",
"5 نعت وزارة الشّؤون الثّقافيّة المنشد الصّوفي عز... culture\n",
"6 أعلنت وزارة الشؤون الثقافية بلاغ اليوم الاثنين... culture\n",
"7 استضاف برنامج سينما سينما الأربعاء 18 جانفي ... culture\n",
"8 تنطلق فعاليات التظاهرة الموسيقية الالكترونية ص... culture\n",
"9 ينطلق مهرجان القيروان للشعر العربي ببيت الشعر ... culture"
]
},
"metadata": {
"tags": []
},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Z4GfpaxZlW9q",
"colab_type": "code",
"colab": {}
},
"source": [
"# create a count vectorizer object\n",
"count_vectorizer = CountVectorizer(max_features=1000)\n",
"# fit the count vectorizer using the text data\n",
"feature_count = count_vectorizer.fit(trainingData['text'])\n",
"x_count = feature_count.transform(trainingData['text']).toarray()\n",
"y_count = trainingData.iloc[:,1].values\n",
"#x_count\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "8VzLOIMQladR",
"colab_type": "code",
"colab": {}
},
"source": [
"# tfidf \n",
"vectorizer = TfidfVectorizer(max_features=1000)\n",
"feature_tfidi = vectorizer.fit(trainingData['text'])\n",
"x_tfidf = feature_tfidi.transform(trainingData['text']).toarray()\n",
"y_tfidf = trainingData.iloc[:,1].values\n",
"#x"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "OZAmjLWeledj",
"colab_type": "code",
"outputId": "f4621dca-a87e-4d89-fb2e-c3b058b0e541",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
}
},
"source": [
"print('======= count vectorizer results =================')\n",
"# evaluate_classification returns one 'accuracy' value per model; lay them out as a\n",
"# single row labelled 'result' with one column per model.\n",
"results = evaluate_classification(x_count, y_count).accuracy.to_frame('result').T"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"======= count vectorizer results =================\n",
"Result For LinearSVC\n",
"Accuracy: 0.7319189971070396\n",
"Confusion Matrix:\n",
" [[ 5 1 1 0 8 1 0 2 1]\n",
" [ 2 65 0 19 9 2 4 1 4]\n",
" [ 0 4 18 0 22 0 2 0 0]\n",
" [ 1 14 4 185 21 11 4 6 2]\n",
" [ 7 15 29 43 757 49 85 8 2]\n",
" [ 0 1 0 1 48 98 1 0 0]\n",
" [ 1 2 2 2 86 1 120 0 0]\n",
" [ 0 4 0 2 3 1 0 264 0]\n",
" [ 2 7 1 1 5 0 0 1 6]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.28 0.26 0.27 19\n",
" diverse 0.58 0.61 0.59 106\n",
" economy 0.33 0.39 0.36 46\n",
"internationalNews 0.73 0.75 0.74 248\n",
" localnews 0.79 0.76 0.77 995\n",
" politic 0.60 0.66 0.63 149\n",
" society 0.56 0.56 0.56 214\n",
" sport 0.94 0.96 0.95 274\n",
" technology 0.40 0.26 0.32 23\n",
"\n",
" accuracy 0.73 2074\n",
" macro avg 0.58 0.58 0.58 2074\n",
" weighted avg 0.73 0.73 0.73 2074\n",
"\n",
"Result For MultinomialNB\n",
"Accuracy: 0.6899710703953713\n",
"Confusion Matrix:\n",
" [[ 9 1 0 0 8 0 0 0 1]\n",
" [ 3 76 1 18 0 0 1 2 5]\n",
" [ 1 2 32 0 8 0 2 0 1]\n",
" [ 6 13 3 209 3 7 4 1 2]\n",
" [ 26 9 43 28 542 128 209 2 8]\n",
" [ 2 0 0 5 14 128 0 0 0]\n",
" [ 2 3 1 0 35 1 172 0 0]\n",
" [ 6 3 1 2 5 2 0 253 2]\n",
" [ 1 6 2 1 3 0 0 0 10]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.16 0.47 0.24 19\n",
" diverse 0.67 0.72 0.69 106\n",
" economy 0.39 0.70 0.50 46\n",
"internationalNews 0.79 0.84 0.82 248\n",
" localnews 0.88 0.54 0.67 995\n",
" politic 0.48 0.86 0.62 149\n",
" society 0.44 0.80 0.57 214\n",
" sport 0.98 0.92 0.95 274\n",
" technology 0.34 0.43 0.38 23\n",
"\n",
" accuracy 0.69 2074\n",
" macro avg 0.57 0.70 0.60 2074\n",
" weighted avg 0.77 0.69 0.70 2074\n",
"\n",
"Result For BernoulliNB\n",
"Accuracy: 0.6822565091610414\n",
"Confusion Matrix:\n",
" [[ 8 1 1 0 5 0 1 2 1]\n",
" [ 1 78 1 21 0 0 0 3 2]\n",
" [ 1 2 34 0 7 0 2 0 0]\n",
" [ 3 14 3 211 4 7 3 3 0]\n",
" [ 23 14 53 37 519 131 202 9 7]\n",
" [ 1 0 1 4 17 125 0 1 0]\n",
" [ 1 4 1 3 30 0 173 2 0]\n",
" [ 3 1 0 2 2 1 0 264 1]\n",
" [ 1 13 3 1 2 0 0 0 3]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.19 0.42 0.26 19\n",
" diverse 0.61 0.74 0.67 106\n",
" economy 0.35 0.74 0.48 46\n",
"internationalNews 0.76 0.85 0.80 248\n",
" localnews 0.89 0.52 0.66 995\n",
" politic 0.47 0.84 0.61 149\n",
" society 0.45 0.81 0.58 214\n",
" sport 0.93 0.96 0.95 274\n",
" technology 0.21 0.13 0.16 23\n",
"\n",
" accuracy 0.68 2074\n",
" macro avg 0.54 0.67 0.57 2074\n",
" weighted avg 0.76 0.68 0.69 2074\n",
"\n",
"Result For SGDClassifier\n",
"Accuracy: 0.73625843780135\n",
"Confusion Matrix:\n",
" [[ 3 2 1 0 8 0 1 3 1]\n",
" [ 1 61 0 28 10 1 0 2 3]\n",
" [ 1 4 19 0 20 0 2 0 0]\n",
" [ 0 9 3 209 17 6 1 3 0]\n",
" [ 7 17 32 45 747 42 99 5 1]\n",
" [ 0 1 2 2 56 88 0 0 0]\n",
" [ 0 2 2 3 73 1 133 0 0]\n",
" [ 0 3 0 4 3 0 0 264 0]\n",
" [ 2 7 1 1 7 0 1 1 3]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.21 0.16 0.18 19\n",
" diverse 0.58 0.58 0.58 106\n",
" economy 0.32 0.41 0.36 46\n",
"internationalNews 0.72 0.84 0.77 248\n",
" localnews 0.79 0.75 0.77 995\n",
" politic 0.64 0.59 0.61 149\n",
" society 0.56 0.62 0.59 214\n",
" sport 0.95 0.96 0.96 274\n",
" technology 0.38 0.13 0.19 23\n",
"\n",
" accuracy 0.74 2074\n",
" macro avg 0.57 0.56 0.56 2074\n",
" weighted avg 0.74 0.74 0.74 2074\n",
"\n",
"Result For DecisionTreeClassifier\n",
"Accuracy: 0.5814850530376084\n",
"Confusion Matrix:\n",
" [[ 0 0 0 0 19 0 0 0 0]\n",
" [ 0 0 0 11 95 0 0 0 0]\n",
" [ 0 0 0 0 44 0 2 0 0]\n",
" [ 0 0 0 33 215 0 0 0 0]\n",
" [ 0 0 0 6 943 18 28 0 0]\n",
" [ 0 0 0 0 122 27 0 0 0]\n",
" [ 0 0 0 0 156 0 58 0 0]\n",
" [ 0 0 0 3 126 0 0 145 0]\n",
" [ 0 0 0 0 23 0 0 0 0]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.00 0.00 0.00 19\n",
" diverse 0.00 0.00 0.00 106\n",
" economy 0.00 0.00 0.00 46\n",
"internationalNews 0.62 0.13 0.22 248\n",
" localnews 0.54 0.95 0.69 995\n",
" politic 0.60 0.18 0.28 149\n",
" society 0.66 0.27 0.38 214\n",
" sport 1.00 0.53 0.69 274\n",
" technology 0.00 0.00 0.00 23\n",
"\n",
" accuracy 0.58 2074\n",
" macro avg 0.38 0.23 0.25 2074\n",
" weighted avg 0.58 0.58 0.51 2074\n",
"\n",
"Result For RandomForestClassifier\n",
"Accuracy: 0.4802314368370299\n",
"Confusion Matrix:\n",
" [[ 0 0 0 0 19 0 0 0 0]\n",
" [ 0 0 0 0 106 0 0 0 0]\n",
" [ 0 0 0 0 46 0 0 0 0]\n",
" [ 0 0 0 0 248 0 0 0 0]\n",
" [ 0 0 0 0 995 0 0 0 0]\n",
" [ 0 0 0 0 149 0 0 0 0]\n",
" [ 0 0 0 0 214 0 0 0 0]\n",
" [ 0 0 0 0 273 0 0 1 0]\n",
" [ 0 0 0 0 23 0 0 0 0]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.00 0.00 0.00 19\n",
" diverse 0.00 0.00 0.00 106\n",
" economy 0.00 0.00 0.00 46\n",
"internationalNews 0.00 0.00 0.00 248\n",
" localnews 0.48 1.00 0.65 995\n",
" politic 0.00 0.00 0.00 149\n",
" society 0.00 0.00 0.00 214\n",
" sport 1.00 0.00 0.01 274\n",
" technology 0.00 0.00 0.00 23\n",
"\n",
" accuracy 0.48 2074\n",
" macro avg 0.16 0.11 0.07 2074\n",
" weighted avg 0.36 0.48 0.31 2074\n",
"\n",
"Result For AdaBoostClassifier\n",
"Accuracy: 0.6634522661523626\n",
"Confusion Matrix:\n",
" [[ 0 0 0 0 18 0 0 1 0]\n",
" [ 0 5 0 32 69 0 0 0 0]\n",
" [ 0 0 4 0 42 0 0 0 0]\n",
" [ 0 0 0 119 126 1 0 2 0]\n",
" [ 0 0 1 7 972 2 12 1 0]\n",
" [ 0 0 0 0 137 12 0 0 0]\n",
" [ 0 0 0 0 186 0 28 0 0]\n",
" [ 0 0 0 0 38 0 0 236 0]\n",
" [ 0 0 0 1 21 0 0 1 0]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.00 0.00 0.00 19\n",
" diverse 1.00 0.05 0.09 106\n",
" economy 0.80 0.09 0.16 46\n",
"internationalNews 0.75 0.48 0.58 248\n",
" localnews 0.60 0.98 0.75 995\n",
" politic 0.80 0.08 0.15 149\n",
" society 0.70 0.13 0.22 214\n",
" sport 0.98 0.86 0.92 274\n",
" technology 0.00 0.00 0.00 23\n",
"\n",
" accuracy 0.66 2074\n",
" macro avg 0.63 0.30 0.32 2074\n",
" weighted avg 0.71 0.66 0.59 2074\n",
"\n",
"Result For KNeighborsClassifier\n",
"Accuracy: 0.5607521697203471\n",
"Confusion Matrix:\n",
" [[ 2 1 1 1 11 0 0 2 1]\n",
" [ 2 31 1 11 55 0 0 4 2]\n",
" [ 1 2 13 1 23 1 2 3 0]\n",
" [ 0 37 2 83 109 3 0 13 1]\n",
" [ 6 70 26 43 735 28 49 38 0]\n",
" [ 0 11 1 6 77 54 0 0 0]\n",
" [ 1 11 0 4 136 0 61 1 0]\n",
" [ 1 16 0 7 69 0 0 181 0]\n",
" [ 0 5 3 1 11 0 0 0 3]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.15 0.11 0.12 19\n",
" diverse 0.17 0.29 0.21 106\n",
" economy 0.28 0.28 0.28 46\n",
"internationalNews 0.53 0.33 0.41 248\n",
" localnews 0.60 0.74 0.66 995\n",
" politic 0.63 0.36 0.46 149\n",
" society 0.54 0.29 0.37 214\n",
" sport 0.75 0.66 0.70 274\n",
" technology 0.43 0.13 0.20 23\n",
"\n",
" accuracy 0.56 2074\n",
" macro avg 0.45 0.35 0.38 2074\n",
" weighted avg 0.57 0.56 0.55 2074\n",
"\n",
"Result For ensemble-Learning\n",
"Accuracy: 0.7671166827386693\n",
"Confusion Matrix:\n",
" [[ 7 0 1 0 8 0 0 2 1]\n",
" [ 2 69 1 17 9 2 2 2 2]\n",
" [ 0 3 22 0 19 0 2 0 0]\n",
" [ 1 11 3 208 15 8 1 1 0]\n",
" [ 7 13 29 37 766 46 94 2 1]\n",
" [ 0 1 1 2 39 106 0 0 0]\n",
" [ 0 3 2 1 67 1 140 0 0]\n",
" [ 0 4 0 1 2 0 0 267 0]\n",
" [ 2 7 2 1 4 0 0 1 6]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.37 0.37 0.37 19\n",
" diverse 0.62 0.65 0.64 106\n",
" economy 0.36 0.48 0.41 46\n",
"internationalNews 0.78 0.84 0.81 248\n",
" localnews 0.82 0.77 0.80 995\n",
" politic 0.65 0.71 0.68 149\n",
" society 0.59 0.65 0.62 214\n",
" sport 0.97 0.97 0.97 274\n",
" technology 0.60 0.26 0.36 23\n",
"\n",
" accuracy 0.77 2074\n",
" macro avg 0.64 0.63 0.63 2074\n",
" weighted avg 0.77 0.77 0.77 2074\n",
"\n",
"[LibLinear]Result For scikit_log_reg\n",
"Accuracy: 0.7492767598842816\n",
"Confusion Matrix:\n",
" [[ 4 0 1 0 11 0 0 2 1]\n",
" [ 3 62 0 19 14 1 3 1 3]\n",
" [ 0 2 20 0 22 0 2 0 0]\n",
" [ 1 14 3 192 27 9 1 1 0]\n",
" [ 4 10 23 39 797 42 77 2 1]\n",
" [ 0 0 1 2 57 89 0 0 0]\n",
" [ 0 2 2 2 89 1 118 0 0]\n",
" [ 0 3 0 1 3 0 0 267 0]\n",
" [ 2 7 0 1 7 0 0 1 5]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.29 0.21 0.24 19\n",
" diverse 0.62 0.58 0.60 106\n",
" economy 0.40 0.43 0.42 46\n",
"internationalNews 0.75 0.77 0.76 248\n",
" localnews 0.78 0.80 0.79 995\n",
" politic 0.63 0.60 0.61 149\n",
" society 0.59 0.55 0.57 214\n",
" sport 0.97 0.97 0.97 274\n",
" technology 0.50 0.22 0.30 23\n",
"\n",
" accuracy 0.75 2074\n",
" macro avg 0.61 0.57 0.59 2074\n",
" weighted avg 0.75 0.75 0.75 2074\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "c1i0vYSplifv",
"colab_type": "code",
"outputId": "8b53ffe4-636a-44fb-ebf4-57704fa1ffda",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 118
}
},
"source": [
"results"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LinearSVC</th>\n",
" <th>MultinomialNB</th>\n",
" <th>BernoulliNB</th>\n",
" <th>SGDClassifier</th>\n",
" <th>DecisionTreeClassifier</th>\n",
" <th>RandomForestClassifier</th>\n",
" <th>AdaBoostClassifier</th>\n",
" <th>KNeighborsClassifier</th>\n",
" <th>ensemble-Learning</th>\n",
" <th>scikit_log_reg</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>result</th>\n",
" <td>0.731919</td>\n",
" <td>0.689971</td>\n",
" <td>0.682257</td>\n",
" <td>0.736258</td>\n",
" <td>0.581485</td>\n",
" <td>0.480231</td>\n",
" <td>0.663452</td>\n",
" <td>0.560752</td>\n",
" <td>0.767117</td>\n",
" <td>0.749277</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LinearSVC MultinomialNB ... ensemble-Learning scikit_log_reg\n",
"result 0.731919 0.689971 ... 0.767117 0.749277\n",
"\n",
"[1 rows x 10 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "L-ESIytxllgA",
"colab_type": "code",
"outputId": "3b056359-815f-4466-b85e-2525b3f85c19",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
}
},
"source": [
"print('======= TfidfVectorizer results =================')\n",
"# evaluate_classification returns one 'accuracy' value per model; lay them out as a\n",
"# single row labelled 'result' with one column per model.\n",
"results = evaluate_classification(x_tfidf, y_tfidf).accuracy.to_frame('result').T"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"======= TfidfVectorizer results =================\n",
"Result For LinearSVC\n",
"Accuracy: 0.7690453230472517\n",
"Confusion Matrix:\n",
" [[ 3 2 0 0 10 0 1 2 1]\n",
" [ 0 62 0 24 13 0 1 1 5]\n",
" [ 0 2 18 0 24 0 2 0 0]\n",
" [ 1 11 3 207 20 5 0 1 0]\n",
" [ 3 8 22 29 828 36 66 2 1]\n",
" [ 0 0 1 5 60 83 0 0 0]\n",
" [ 0 2 0 1 91 0 120 0 0]\n",
" [ 0 2 0 2 1 0 0 269 0]\n",
" [ 2 7 1 2 5 0 0 1 5]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.33 0.16 0.21 19\n",
" diverse 0.65 0.58 0.61 106\n",
" economy 0.40 0.39 0.40 46\n",
"internationalNews 0.77 0.83 0.80 248\n",
" localnews 0.79 0.83 0.81 995\n",
" politic 0.67 0.56 0.61 149\n",
" society 0.63 0.56 0.59 214\n",
" sport 0.97 0.98 0.98 274\n",
" technology 0.42 0.22 0.29 23\n",
"\n",
" accuracy 0.77 2074\n",
" macro avg 0.63 0.57 0.59 2074\n",
" weighted avg 0.76 0.77 0.76 2074\n",
"\n",
"Result For MultinomialNB\n",
"Accuracy: 0.7492767598842816\n",
"Confusion Matrix:\n",
" [[ 0 0 0 0 16 0 0 3 0]\n",
" [ 0 37 1 32 34 0 0 2 0]\n",
" [ 0 1 16 0 28 0 1 0 0]\n",
" [ 0 1 1 194 47 3 1 1 0]\n",
" [ 0 2 3 23 810 44 112 1 0]\n",
" [ 0 0 0 2 56 91 0 0 0]\n",
" [ 0 0 0 0 67 0 147 0 0]\n",
" [ 0 1 0 1 12 1 0 259 0]\n",
" [ 0 6 1 3 12 0 0 1 0]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.00 0.00 0.00 19\n",
" diverse 0.77 0.35 0.48 106\n",
" economy 0.73 0.35 0.47 46\n",
"internationalNews 0.76 0.78 0.77 248\n",
" localnews 0.75 0.81 0.78 995\n",
" politic 0.65 0.61 0.63 149\n",
" society 0.56 0.69 0.62 214\n",
" sport 0.97 0.95 0.96 274\n",
" technology 0.00 0.00 0.00 23\n",
"\n",
" accuracy 0.75 2074\n",
" macro avg 0.58 0.50 0.52 2074\n",
" weighted avg 0.74 0.75 0.74 2074\n",
"\n",
"Result For BernoulliNB\n",
"Accuracy: 0.6822565091610414\n",
"Confusion Matrix:\n",
" [[ 8 1 1 0 5 0 1 2 1]\n",
" [ 1 78 1 21 0 0 0 3 2]\n",
" [ 1 2 34 0 7 0 2 0 0]\n",
" [ 3 14 3 211 4 7 3 3 0]\n",
" [ 23 14 53 37 519 131 202 9 7]\n",
" [ 1 0 1 4 17 125 0 1 0]\n",
" [ 1 4 1 3 30 0 173 2 0]\n",
" [ 3 1 0 2 2 1 0 264 1]\n",
" [ 1 13 3 1 2 0 0 0 3]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.19 0.42 0.26 19\n",
" diverse 0.61 0.74 0.67 106\n",
" economy 0.35 0.74 0.48 46\n",
"internationalNews 0.76 0.85 0.80 248\n",
" localnews 0.89 0.52 0.66 995\n",
" politic 0.47 0.84 0.61 149\n",
" society 0.45 0.81 0.58 214\n",
" sport 0.93 0.96 0.95 274\n",
" technology 0.21 0.13 0.16 23\n",
"\n",
" accuracy 0.68 2074\n",
" macro avg 0.54 0.67 0.57 2074\n",
" weighted avg 0.76 0.68 0.69 2074\n",
"\n",
"Result For SGDClassifier\n",
"Accuracy: 0.7675988428158148\n",
"Confusion Matrix:\n",
" [[ 4 2 0 0 10 0 1 1 1]\n",
" [ 0 57 1 25 16 0 2 1 4]\n",
" [ 0 1 17 0 27 0 1 0 0]\n",
" [ 0 7 4 211 21 4 0 1 0]\n",
" [ 2 7 12 27 847 32 66 2 0]\n",
" [ 0 0 0 5 75 69 0 0 0]\n",
" [ 0 2 0 1 93 0 118 0 0]\n",
" [ 0 1 0 2 7 0 0 264 0]\n",
" [ 2 6 1 2 6 0 0 1 5]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.50 0.21 0.30 19\n",
" diverse 0.69 0.54 0.60 106\n",
" economy 0.49 0.37 0.42 46\n",
"internationalNews 0.77 0.85 0.81 248\n",
" localnews 0.77 0.85 0.81 995\n",
" politic 0.66 0.46 0.54 149\n",
" society 0.63 0.55 0.59 214\n",
" sport 0.98 0.96 0.97 274\n",
" technology 0.50 0.22 0.30 23\n",
"\n",
" accuracy 0.77 2074\n",
" macro avg 0.66 0.56 0.59 2074\n",
" weighted avg 0.76 0.77 0.76 2074\n",
"\n",
"Result For DecisionTreeClassifier\n",
"Accuracy: 0.5800385728061717\n",
"Confusion Matrix:\n",
" [[ 0 0 0 0 19 0 0 0 0]\n",
" [ 0 0 0 11 95 0 0 0 0]\n",
" [ 0 0 0 0 45 0 1 0 0]\n",
" [ 0 0 0 32 216 0 0 0 0]\n",
" [ 0 0 0 6 952 18 19 0 0]\n",
" [ 0 0 0 0 122 27 0 0 0]\n",
" [ 0 0 0 0 166 0 48 0 0]\n",
" [ 1 0 0 3 126 0 0 144 0]\n",
" [ 0 0 0 0 23 0 0 0 0]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.00 0.00 0.00 19\n",
" diverse 0.00 0.00 0.00 106\n",
" economy 0.00 0.00 0.00 46\n",
"internationalNews 0.62 0.13 0.21 248\n",
" localnews 0.54 0.96 0.69 995\n",
" politic 0.60 0.18 0.28 149\n",
" society 0.71 0.22 0.34 214\n",
" sport 1.00 0.53 0.69 274\n",
" technology 0.00 0.00 0.00 23\n",
"\n",
" accuracy 0.58 2074\n",
" macro avg 0.38 0.22 0.25 2074\n",
" weighted avg 0.58 0.58 0.50 2074\n",
"\n",
"Result For RandomForestClassifier\n",
"Accuracy: 0.47974927675988427\n",
"Confusion Matrix:\n",
" [[ 0 0 0 0 19 0 0 0 0]\n",
" [ 0 0 0 0 106 0 0 0 0]\n",
" [ 0 0 0 0 46 0 0 0 0]\n",
" [ 0 0 0 0 248 0 0 0 0]\n",
" [ 0 0 0 0 995 0 0 0 0]\n",
" [ 0 0 0 0 149 0 0 0 0]\n",
" [ 0 0 0 0 214 0 0 0 0]\n",
" [ 0 0 0 0 274 0 0 0 0]\n",
" [ 0 0 0 0 23 0 0 0 0]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.00 0.00 0.00 19\n",
" diverse 0.00 0.00 0.00 106\n",
" economy 0.00 0.00 0.00 46\n",
"internationalNews 0.00 0.00 0.00 248\n",
" localnews 0.48 1.00 0.65 995\n",
" politic 0.00 0.00 0.00 149\n",
" society 0.00 0.00 0.00 214\n",
" sport 0.00 0.00 0.00 274\n",
" technology 0.00 0.00 0.00 23\n",
"\n",
" accuracy 0.48 2074\n",
" macro avg 0.05 0.11 0.07 2074\n",
" weighted avg 0.23 0.48 0.31 2074\n",
"\n",
"Result For AdaBoostClassifier\n",
"Accuracy: 0.47974927675988427\n",
"Confusion Matrix:\n",
" [[ 0 0 0 0 19 0 0 0 0]\n",
" [ 0 0 0 0 106 0 0 0 0]\n",
" [ 0 0 0 0 46 0 0 0 0]\n",
" [ 0 0 0 0 248 0 0 0 0]\n",
" [ 0 0 0 0 995 0 0 0 0]\n",
" [ 0 0 0 0 149 0 0 0 0]\n",
" [ 0 0 0 0 214 0 0 0 0]\n",
" [ 0 0 0 0 274 0 0 0 0]\n",
" [ 0 0 0 0 23 0 0 0 0]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.00 0.00 0.00 19\n",
" diverse 0.00 0.00 0.00 106\n",
" economy 0.00 0.00 0.00 46\n",
"internationalNews 0.00 0.00 0.00 248\n",
" localnews 0.48 1.00 0.65 995\n",
" politic 0.00 0.00 0.00 149\n",
" society 0.00 0.00 0.00 214\n",
" sport 0.00 0.00 0.00 274\n",
" technology 0.00 0.00 0.00 23\n",
"\n",
" accuracy 0.48 2074\n",
" macro avg 0.05 0.11 0.07 2074\n",
" weighted avg 0.23 0.48 0.31 2074\n",
"\n",
"Result For KNeighborsClassifier\n",
"Accuracy: 0.44792671166827386\n",
"Confusion Matrix:\n",
" [[ 0 1 0 5 11 0 0 2 0]\n",
" [ 0 6 0 13 69 0 0 18 0]\n",
" [ 0 2 8 1 30 0 0 4 1]\n",
" [ 0 21 0 68 127 4 0 28 0]\n",
" [ 1 29 11 140 694 12 6 102 0]\n",
" [ 0 33 0 12 72 19 0 13 0]\n",
" [ 0 12 1 19 154 0 14 14 0]\n",
" [ 0 26 0 14 114 0 0 120 0]\n",
" [ 1 2 1 3 13 0 0 3 0]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.00 0.00 0.00 19\n",
" diverse 0.05 0.06 0.05 106\n",
" economy 0.38 0.17 0.24 46\n",
"internationalNews 0.25 0.27 0.26 248\n",
" localnews 0.54 0.70 0.61 995\n",
" politic 0.54 0.13 0.21 149\n",
" society 0.70 0.07 0.12 214\n",
" sport 0.39 0.44 0.42 274\n",
" technology 0.00 0.00 0.00 23\n",
"\n",
" accuracy 0.45 2074\n",
" macro avg 0.32 0.20 0.21 2074\n",
" weighted avg 0.46 0.45 0.41 2074\n",
"\n",
"Result For ensemble-Learning\n",
"Accuracy: 0.7753134040501446\n",
"Confusion Matrix:\n",
" [[ 3 2 0 0 10 0 1 2 1]\n",
" [ 0 56 1 28 16 0 1 1 3]\n",
" [ 0 1 17 0 26 0 2 0 0]\n",
" [ 0 8 3 213 19 4 0 1 0]\n",
" [ 2 9 13 31 839 37 63 1 0]\n",
" [ 0 0 0 3 66 80 0 0 0]\n",
" [ 0 2 0 1 84 0 127 0 0]\n",
" [ 0 0 0 2 4 0 0 268 0]\n",
" [ 2 6 1 2 6 0 0 1 5]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.43 0.16 0.23 19\n",
" diverse 0.67 0.53 0.59 106\n",
" economy 0.49 0.37 0.42 46\n",
"internationalNews 0.76 0.86 0.81 248\n",
" localnews 0.78 0.84 0.81 995\n",
" politic 0.66 0.54 0.59 149\n",
" society 0.65 0.59 0.62 214\n",
" sport 0.98 0.98 0.98 274\n",
" technology 0.56 0.22 0.31 23\n",
"\n",
" accuracy 0.78 2074\n",
" macro avg 0.66 0.56 0.60 2074\n",
" weighted avg 0.77 0.78 0.77 2074\n",
"\n",
"[LibLinear]Result For scikit_log_reg\n",
"Accuracy: 0.7743490838958534\n",
"Confusion Matrix:\n",
" [[ 2 2 0 0 12 0 1 2 0]\n",
" [ 0 57 0 27 18 0 1 1 2]\n",
" [ 0 1 16 0 27 0 2 0 0]\n",
" [ 0 7 2 210 24 4 0 1 0]\n",
" [ 2 9 11 29 854 35 54 1 0]\n",
" [ 0 0 0 3 72 74 0 0 0]\n",
" [ 0 2 0 1 90 0 121 0 0]\n",
" [ 0 1 0 2 4 0 0 267 0]\n",
" [ 2 6 1 2 6 0 0 1 5]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" culture 0.33 0.11 0.16 19\n",
" diverse 0.67 0.54 0.60 106\n",
" economy 0.53 0.35 0.42 46\n",
"internationalNews 0.77 0.85 0.80 248\n",
" localnews 0.77 0.86 0.81 995\n",
" politic 0.65 0.50 0.56 149\n",
" society 0.68 0.57 0.62 214\n",
" sport 0.98 0.97 0.98 274\n",
" technology 0.71 0.22 0.33 23\n",
"\n",
" accuracy 0.77 2074\n",
" macro avg 0.68 0.55 0.59 2074\n",
" weighted avg 0.76 0.77 0.76 2074\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "SaBbgdp8lph_",
"colab_type": "code",
"outputId": "ecf83a40-4c2d-4833-c990-6ab569240ac4",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 118
}
},
"source": [
"results"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>LinearSVC</th>\n",
" <th>MultinomialNB</th>\n",
" <th>BernoulliNB</th>\n",
" <th>SGDClassifier</th>\n",
" <th>DecisionTreeClassifier</th>\n",
" <th>RandomForestClassifier</th>\n",
" <th>AdaBoostClassifier</th>\n",
" <th>KNeighborsClassifier</th>\n",
" <th>ensemble-Learning</th>\n",
" <th>scikit_log_reg</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>result</th>\n",
" <td>0.769045</td>\n",
" <td>0.749277</td>\n",
" <td>0.682257</td>\n",
" <td>0.767599</td>\n",
" <td>0.580039</td>\n",
" <td>0.479749</td>\n",
" <td>0.479749</td>\n",
" <td>0.447927</td>\n",
" <td>0.775313</td>\n",
" <td>0.774349</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" LinearSVC MultinomialNB ... ensemble-Learning scikit_log_reg\n",
"result 0.769045 0.749277 ... 0.775313 0.774349\n",
"\n",
"[1 rows x 10 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "-m46ZazLlvf-",
"colab_type": "code",
"outputId": "c610fdc2-a02c-46ba-fb3f-148f33645826",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 283
}
},
"source": [
"cf_matrix = [[3 , 2, 0, 0, 10, 0, 1, 2, 1],\n",
" [ 0, 56, 1, 28, 16 , 0 , 1 , 1 , 3],\n",
" [ 0 , 1 , 17 , 0 , 26 , 0 , 2 , 0 , 0],\n",
" [ 0 , 8 , 3 ,213 ,19 ,4 ,0 ,1 ,0],\n",
" [ 2 ,9 , 13 , 31 ,839 , 37 , 63 , 1 , 0],\n",
" [ 0 , 0 , 0 , 3 ,66 , 80 , 0 , 0 , 0],\n",
" [ 0, 2 , 0 , 1 ,84 , 0 ,127 , 0 , 0],\n",
" [ 0 , 0 , 0 , 2 , 4 ,0 , 0 ,268 , 0],\n",
" [ 2 , 6 , 1 , 2 , 6 ,0 ,0 , 1 , 5]]\n",
"import seaborn as sns\n",
"sns.heatmap(cf_matrix, annot=True)"
],
"execution_count": 0,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f2f7be80cf8>"
]
},
"metadata": {
"tags": []
},
"execution_count": 3
},
{
"output_type": "display_data",
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 2 Axes>"
]
},
"metadata": {
"tags": [],
"needs_background": "light"
}
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment