# Naive Bayes

[Tutorial - How to build a Spam Classifier in python and sklearn - milindsoorya.site](https://milindsoorya.site/blog/build-a-spam-classifier-in-python)

In [1]:
# Toy corpus: short spam and ham phrases, each split into a training
# and a test list.
train_spam = [
    "send us your password",
    "review our website",
    "send your password",
    "send us your account",
]
train_ham = [
    "Your activity report",
    "benefits physical activity",
    "the importance vows",
]
test_spam = [
    "renew your password",
    "renew your vows",
]
test_ham = [
    "benefits of our account",
    "the importance of physical activity",
]

In [5]:
import pandas as pd

In [101]:
# Pool all spam phrases (label 1) followed by all ham phrases (label 0)
# into one labelled frame.
spam_texts = train_spam + test_spam
ham_texts = train_ham + test_ham
data = pd.DataFrame({
    'text': spam_texts + ham_texts,
    'label': [1] * len(spam_texts) + [0] * len(ham_texts),
})
data

Unnamed: 0,text,label
0,send us your password,1
1,review our website,1
2,send your password,1
3,send us your account,1
4,renew your password,1
5,renew your vows,1
6,Your activity report,0
7,benefits physical activity,0
8,the importance vows,0
9,benefits of our account,0


In [69]:
import nltk
# Fetch the NLTK stop-word corpus; when it is already installed this only
# reports that the package is up to date.
nltk.download('stopwords')

# text_process (below) removes punctuation and stop words;
# `string` supplies the punctuation characters it strips.
import string

def text_process(text, stop_words=None):
    """Strip punctuation and stop words from a message.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, NLTK's English stop-word
        list is used (the ``stopwords`` corpus must be downloaded).

    Returns
    -------
    str
        The remaining words, original casing preserved, joined by single
        spaces. Empty string when nothing remains.
    """
    if stop_words is None:
        # Local import keeps the function usable on a fresh kernel without
        # relying on a module-level `stopwords` name being defined.
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
    # Build the lookup set once per call rather than re-reading the
    # stop-word list for every single word.
    stop_words = frozenset(stop_words)

    text = text.translate(str.maketrans('', '', string.punctuation))
    # Comparison is case-insensitive, but the kept words are not lower-cased.
    kept = [word for word in text.split() if word.lower() not in stop_words]

    return " ".join(kept)

# Cleaned copy of every message (punctuation and stop words removed);
# the raw data['text'] column itself is left unchanged.
text = data['text'].apply(text_process)

[nltk_data] Downloading package stopwords to /home/iz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
from collections import Counter

# Count how often each word occurs across all processed messages.
# split() with no argument yields no tokens for an empty message, so a
# message reduced to "" (every word a stop word) cannot add a bogus ""
# entry to the vocabulary, which split(' ') would do.
total_counts = Counter(
    word for message in text for word in message.split()
)

# len() of the counter is the number of distinct words (vocabulary size).
print("Total words in data set: ", len(total_counts))

Total words in data set:  13


In [77]:
# Vocabulary ordered from most to least frequent word; words with equal
# counts stay in the order they were first counted.
vocab = [word for word, _count in total_counts.most_common()]
print(vocab)

['send', 'password', 'activity', 'us', 'account', 'renew', 'vows', 'benefits', 'physical', 'importance', 'review', 'website', 'report']


In [78]:
vocab_size = len(vocab)
# Map each vocabulary word to its position in `vocab`; text_to_vector
# uses that position as the word's slot in the count vector.
word2idx = {word: position for position, word in enumerate(vocab)}

word2idx

{'send': 0,
 'password': 1,
 'activity': 2,
 'us': 3,
 'account': 4,
 'renew': 5,
 'vows': 6,
 'benefits': 7,
 'physical': 8,
 'importance': 9,
 'review': 10,
 'website': 11,
 'report': 12}

In [79]:
def text_to_vector(text):
    """Turn a space-separated message into a bag-of-words count vector.

    Reads the notebook-level ``word2idx`` mapping and ``vocab_size``.
    Words missing from ``word2idx`` are ignored.

    Returns a float NumPy array of length ``vocab_size`` whose entry at
    ``word2idx[w]`` is the number of times ``w`` occurs in ``text``.
    """
    counts = np.zeros(vocab_size)
    for token in text.split(" "):
        slot = word2idx.get(token)
        if slot is not None:
            counts[slot] += 1
    return counts

In [80]:
import numpy as np

# One row per processed message, one column per vocabulary word.
n_messages, n_terms = len(text), len(vocab)
word_vectors = np.zeros((n_messages, n_terms), dtype=np.int_)
for row, message in enumerate(text):
    # text_to_vector returns floats; assignment casts them to the int matrix.
    word_vectors[row, :] = text_to_vector(message)

word_vectors, word_vectors.shape

(array([[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]]),
 (11, 13))

In [103]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows for evaluation; the fixed random_state keeps
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    word_vectors,
    data['label'],
    test_size=0.5,
    random_state=111,
)

In [104]:
from sklearn.naive_bayes import MultinomialNB

In [105]:
# Multinomial Naive Bayes classifier over the word-count features;
# alpha=0.2 is the additive smoothing strength passed to scikit-learn.
mnb = MultinomialNB(alpha=0.2)

In [106]:
def train(clf, features, targets):
    """Fit ``clf`` in place by calling ``clf.fit(features, targets)``.

    Returns None; the fitted state lives on ``clf`` itself.
    """
    clf.fit(features, targets)
    return None

def predict(clf, features):
    """Return whatever ``clf.predict(features)`` produces, unchanged."""
    predictions = clf.predict(features)
    return predictions

In [111]:
pred_scores_word_vectors = []

# Fit on the training half through the train() helper, then display the
# predicted labels for the held-out half through predict().
train(mnb, X_train, y_train)
predict(mnb, X_test)

array([1, 1, 0, 0, 1, 1])

In [112]:
# True labels of the held-out rows, shown for comparison with the
# predictions from the previous cell.
y_test

5    1
0    1
7    0
8    0
2    1
1    1
Name: label, dtype: int64

In [113]:
# Per-class probabilities for each held-out row: the first column is
# label 0 (ham), the second is label 1 (spam).
mnb.predict_proba(X_test)

array([[0.13545966, 0.86454034],
       [0.00343377, 0.99656623],
       [0.99661845, 0.00338155],
       [0.84941176, 0.15058824],
       [0.02544942, 0.97455058],
       [0.48456376, 0.51543624]])