{ "cells": [ { "cell_type": "markdown", "id": "7c067bf9", "metadata": {}, "source": [ "# Naive Bayes" ] }, { "cell_type": "markdown", "id": "da2a88bd", "metadata": {}, "source": [ "[Tutorial - How to build a Spam Classifier in python and sklearn - milindsoorya.site](https://milindsoorya.site/blog/build-a-spam-classifier-in-python)" ] }, { "cell_type": "code", "execution_count": 1, "id": "d5deff12", "metadata": {}, "outputs": [], "source": [
 "# Toy corpus: short messages grouped by class (spam / ham) and by split (train / test).\n",
 "train_spam = [\n",
 "    'send us your password',\n",
 "    'review our website',\n",
 "    'send your password',\n",
 "    'send us your account',\n",
 "]\n",
 "train_ham = [\n",
 "    'Your activity report',\n",
 "    'benefits physical activity',\n",
 "    'the importance vows',\n",
 "]\n",
 "test_spam = [\n",
 "    'renew your password',\n",
 "    'renew your vows',\n",
 "]\n",
 "test_ham = [\n",
 "    'benefits of our account',\n",
 "    'the importance of physical activity',\n",
 "]"
 ] }, { "cell_type": "code", "execution_count": 5, "id": "30220879", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 101, "id": "a7b271ba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textlabel
0send us your password1
1review our website1
2send your password1
3send us your account1
4renew your password1
5renew your vows1
6Your activity report0
7benefits physical activity0
8the importance vows0
9benefits of our account0
10the importance of physical activity0
\n", "
" ], "text/plain": [ " text label\n", "0 send us your password 1\n", "1 review our website 1\n", "2 send your password 1\n", "3 send us your account 1\n", "4 renew your password 1\n", "5 renew your vows 1\n", "6 Your activity report 0\n", "7 benefits physical activity 0\n", "8 the importance vows 0\n", "9 benefits of our account 0\n", "10 the importance of physical activity 0" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Merge the four message lists into one frame; label 1 marks spam, 0 marks ham.\n",
 "spam_texts = train_spam + test_spam\n",
 "ham_texts = train_ham + test_ham\n",
 "data = pd.DataFrame({\n",
 "    'text': spam_texts + ham_texts,\n",
 "    'label': [1] * len(spam_texts) + [0] * len(ham_texts),\n",
 "})\n",
 "data"
 ] }, { "cell_type": "code", "execution_count": 69, "id": "c514e14f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /home/iz/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n" ] } ], "source": [
 "import string\n",
 "\n",
 "import nltk\n",
 "from nltk.corpus import stopwords\n",
 "\n",
 "nltk.download('stopwords')\n",
 "\n",
 "# Read the English stopword list once; a frozenset makes each membership test O(1)\n",
 "# instead of re-reading the corpus for every word.\n",
 "ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))\n",
 "\n",
 "\n",
 "def text_process(text, stop_words=ENGLISH_STOPWORDS):\n",
 "    \"\"\"Strip punctuation and stopwords from one message.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    text : str\n",
 "        Message to clean.\n",
 "    stop_words : collection of str, optional\n",
 "        Lower-case words to drop; defaults to NLTK's English stopword list.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    str\n",
 "        The surviving words in their original casing, joined by single spaces.\n",
 "    \"\"\"\n",
 "    # Delete every character listed in string.punctuation before tokenising.\n",
 "    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))\n",
 "    # Words are compared in lower case, but the kept words are not lower-cased.\n",
 "    kept_words = [\n",
 "        word for word in without_punctuation.split()\n",
 "        if word.lower() not in stop_words\n",
 "    ]\n",
 "    return \" \".join(kept_words)\n",
 "\n",
 "\n",
 "text = data['text'].apply(text_process)"
 ] }, { "cell_type": "code", "execution_count": 70, "id": "56b4071a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total words in data set: 13\n" ] } ], "source": [
 "from collections import Counter\n",
 "\n",
 "# Tally every token of every processed message. split() (rather than split(' '))\n",
 "# yields no tokens for a message emptied by text_process, so '' is never counted.\n",
 "total_counts = Counter(\n",
 "    word\n",
 "    for message in text\n",
 "    for word in message.split()\n",
 ")\n",
 "\n",
 "# len(total_counts) is the number of distinct tokens.\n",
 "print(\"Total words in data set: \", len(total_counts))"
 ] }, { "cell_type": "code", "execution_count": 77, "id": "a9a547d8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['send', 
'password', 'activity', 'us', 'account', 'renew', 'vows', 'benefits', 'physical', 'importance', 'review', 'website', 'report']\n" ] } ], "source": [
 "# Vocabulary sorted by descending frequency; equal counts keep first-seen order.\n",
 "vocab = [word for word, _ in total_counts.most_common()]\n",
 "print(vocab)"
 ] }, { "cell_type": "code", "execution_count": 78, "id": "19090053", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'send': 0,\n", " 'password': 1,\n", " 'activity': 2,\n", " 'us': 3,\n", " 'account': 4,\n", " 'renew': 5,\n", " 'vows': 6,\n", " 'benefits': 7,\n", " 'physical': 8,\n", " 'importance': 9,\n", " 'review': 10,\n", " 'website': 11,\n", " 'report': 12}" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "vocab_size = len(vocab)\n",
 "# Each word maps to its position in vocab.\n",
 "word2idx = {word: position for position, word in enumerate(vocab)}\n",
 "\n",
 "word2idx"
 ] }, { "cell_type": "code", "execution_count": 79, "id": "b6a56502", "metadata": {}, "outputs": [], "source": [
 "def text_to_vector(text):\n",
 "    \"\"\"Turn one processed message into a vector of per-word counts.\n",
 "\n",
 "    Position k of the result holds how many times the word with index k in\n",
 "    `word2idx` occurs in `text` (split on single spaces); tokens missing from\n",
 "    `word2idx` are skipped. The result is a new array of length `vocab_size`\n",
 "    in NumPy's default float dtype.\n",
 "    \"\"\"\n",
 "    counts = np.zeros(vocab_size)\n",
 "    for token in text.split(\" \"):\n",
 "        index = word2idx.get(token)\n",
 "        if index is not None:\n",
 "            counts[index] += 1\n",
 "    return counts"
 ] }, { "cell_type": "code", "execution_count": 80, "id": "637cedcd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],\n", " [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],\n", " [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],\n", " [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],\n", " [0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]]),\n", " (11, 13))" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", 
"\n",
 "# One row per message, one column per vocabulary word; the count vectors from\n",
 "# text_to_vector are stored as integers (dtype=np.int_).\n",
 "word_vectors = np.zeros((len(text), len(vocab)), dtype=np.int_)\n",
 "for row, message in enumerate(text):\n",
 "    word_vectors[row] = text_to_vector(message)\n",
 "\n",
 "word_vectors, word_vectors.shape"
 ] }, { "cell_type": "code", "execution_count": 103, "id": "bdaf9862", "metadata": {}, "outputs": [], "source": [
 "from sklearn.model_selection import train_test_split\n",
 "\n",
 "# Hold out half of the messages for testing; the fixed random_state keeps the split reproducible.\n",
 "X_train, X_test, y_train, y_test = train_test_split(\n",
 "    word_vectors,\n",
 "    data['label'],\n",
 "    test_size=0.5,\n",
 "    random_state=111,\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 104, "id": "ba29b127", "metadata": {}, "outputs": [], "source": [ "from sklearn.naive_bayes import MultinomialNB" ] }, { "cell_type": "code", "execution_count": 105, "id": "7bb9c0e5", "metadata": {}, "outputs": [], "source": [
 "# alpha is MultinomialNB's additive smoothing parameter.\n",
 "mnb = MultinomialNB(alpha=0.2)"
 ] }, { "cell_type": "code", "execution_count": 106, "id": "cb721a3e", "metadata": {}, "outputs": [], "source": [
 "def train(clf, features, targets):\n",
 "    \"\"\"Call `clf.fit(features, targets)`; nothing is returned.\"\"\"\n",
 "    clf.fit(features, targets)\n",
 "\n",
 "\n",
 "def predict(clf, features):\n",
 "    \"\"\"Return the result of `clf.predict(features)`.\"\"\"\n",
 "    return clf.predict(features)"
 ] }, { "cell_type": "code", "execution_count": 111, "id": "5b8e7af2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 0, 0, 1, 1])" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Fit on the training half, then show the predicted labels for the held-out half.\n",
 "train(mnb, X_train, y_train)\n",
 "predict(mnb, X_test)"
 ] }, { "cell_type": "code", "execution_count": 112, "id": "ae59176e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5 1\n", "0 1\n", "7 0\n", "8 0\n", "2 1\n", "1 1\n", "Name: label, dtype: int64" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test" ] }, { "cell_type": "code", "execution_count": 113, "id": "10be914f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.13545966, 0.86454034],\n", " [0.00343377, 0.99656623],\n", " [0.99661845, 0.00338155],\n", " [0.84941176, 0.15058824],\n", " [0.02544942, 0.97455058],\n", 
" [0.48456376, 0.51543624]])" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Class probabilities for each held-out message; columns follow the order of mnb.classes_.\n",
 "mnb.predict_proba(X_test)"
 ] }, { "cell_type": "code", "execution_count": null, "id": "3231ca14", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }