{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7c067bf9",
   "metadata": {},
   "source": [
    "# Naive Bayes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da2a88bd",
   "metadata": {},
   "source": [
    "[Tutorial - How to build a Spam Classifier in python and sklearn - milindsoorya.site](https://milindsoorya.site/blog/build-a-spam-classifier-in-python)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d5deff12",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_spam = ['send us your password', 'review our website', 'send your password', 'send us your account']\n",
    "train_ham = ['Your activity report','benefits physical activity', 'the importance vows']\n",
    "test_spam = ['renew your password', 'renew your vows']\n",
    "test_ham = ['benefits of our account', 'the importance of physical activity']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "30220879",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "a7b271ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>send us your password</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>review our website</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>send your password</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>send us your account</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>renew your password</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>renew your vows</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Your activity report</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>benefits physical activity</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>the importance vows</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>benefits of our account</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>the importance of physical activity</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   text  label\n",
       "0                 send us your password      1\n",
       "1                    review our website      1\n",
       "2                    send your password      1\n",
       "3                  send us your account      1\n",
       "4                   renew your password      1\n",
       "5                       renew your vows      1\n",
       "6                  Your activity report      0\n",
       "7            benefits physical activity      0\n",
       "8                   the importance vows      0\n",
       "9               benefits of our account      0\n",
       "10  the importance of physical activity      0"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.DataFrame({\n",
    "    'text': train_spam + test_spam + train_ham + test_ham,\n",
    "    'label': [1] * (len(train_spam) + len(test_spam)) + [0] * (len(train_ham) + len(test_ham))\n",
    "})\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "c514e14f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /home/iz/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download('stopwords')\n",
    "\n",
    "#remove the punctuations and stopwords\n",
    "import string\n",
    "\n",
    "def text_process(text):\n",
    "\n",
    "    text = text.translate(str.maketrans('', '', string.punctuation))\n",
    "    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]\n",
    "\n",
    "    return \" \".join(text)\n",
    "\n",
    "text = data['text'].apply(text_process)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "56b4071a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total words in data set:  13\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "\n",
    "total_counts = Counter()\n",
    "for i in range(len(text)):\n",
    "    for word in text.values[i].split(' '):\n",
    "        total_counts[word] += 1\n",
    "\n",
    "print(\"Total words in data set: \", len(total_counts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "a9a547d8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['send', 'password', 'activity', 'us', 'account', 'renew', 'vows', 'benefits', 'physical', 'importance', 'review', 'website', 'report']\n"
     ]
    }
   ],
   "source": [
    "vocab = sorted(total_counts, key=total_counts.get, reverse=True)\n",
    "print(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "19090053",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'send': 0,\n",
       " 'password': 1,\n",
       " 'activity': 2,\n",
       " 'us': 3,\n",
       " 'account': 4,\n",
       " 'renew': 5,\n",
       " 'vows': 6,\n",
       " 'benefits': 7,\n",
       " 'physical': 8,\n",
       " 'importance': 9,\n",
       " 'review': 10,\n",
       " 'website': 11,\n",
       " 'report': 12}"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_size = len(vocab)\n",
    "word2idx = {}\n",
    "#print vocab_size\n",
    "for i, word in enumerate(vocab):\n",
    "    word2idx[word] = i\n",
    "    \n",
    "word2idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "b6a56502",
   "metadata": {},
   "outputs": [],
   "source": [
    "def text_to_vector(text):\n",
    "    word_vector = np.zeros(vocab_size)\n",
    "    for word in text.split(\" \"):\n",
    "        if word2idx.get(word) is None:\n",
    "            continue\n",
    "        else:\n",
    "            word_vector[word2idx.get(word)] += 1\n",
    "    return np.array(word_vector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "637cedcd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       "        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],\n",
       "        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       "        [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       "        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
       "        [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],\n",
       "        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],\n",
       "        [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],\n",
       "        [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],\n",
       "        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],\n",
       "        [0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]]),\n",
       " (11, 13))"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "word_vectors = np.zeros((len(text), len(vocab)), dtype=np.int_)\n",
    "for i, t in enumerate(text):\n",
    "    word_vectors[i] = text_to_vector(t)\n",
    "\n",
    "word_vectors, word_vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "bdaf9862",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(word_vectors, data['label'], test_size=0.5, random_state=111)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "ba29b127",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "7bb9c0e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "mnb = MultinomialNB(alpha=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "cb721a3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(clf, features, targets):\n",
    "    clf.fit(features, targets)\n",
    "\n",
    "def predict(clf, features):\n",
    "    return (clf.predict(features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "5b8e7af2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 1, 0, 0, 1, 1])"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_scores_word_vectors = []\n",
    "\n",
    "mnb.fit(X_train, y_train)\n",
    "mnb.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "ae59176e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5    1\n",
       "0    1\n",
       "7    0\n",
       "8    0\n",
       "2    1\n",
       "1    1\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "10be914f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.13545966, 0.86454034],\n",
       "       [0.00343377, 0.99656623],\n",
       "       [0.99661845, 0.00338155],\n",
       "       [0.84941176, 0.15058824],\n",
       "       [0.02544942, 0.97455058],\n",
       "       [0.48456376, 0.51543624]])"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mnb.predict_proba(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3231ca14",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}