How can I extract prediction probabilities by reading external data after word2vec natural language processing?


From data with id/date/text/smishing columns, we extracted only the text and smishing columns, preprocessed the text with word2vec, and trained random forest and KNN classifiers. We now want to read a file called test.csv, which contains only id/date/text columns, and predict whether each row is smishing.

The following is the code we trained with.

import numpy as np
import pandas as pd
import konlpy  # Korean NLP toolkit (not used below)

ham_spam = pd.read_csv(r'C:\Users\chlwn\Desktop\JUNO\data\smishing data\train.csv')
data = list(zip(ham_spam['text'], ham_spam['smishing']))
text = [data[i][0] for i in range(30000)]   # first 30,000 message bodies
label = [data[i][1] for i in range(30000)]  # matching smishing labels

import re
text1 = [re.sub(r'\d+', ' ', tmp) for tmp in text]   # collapse digit runs to a space
text2 = [re.sub(r'\W+', ' ', tmp) for tmp in text1]  # collapse non-word characters to a space
text_split = [tmp.split(' ') for tmp in text2]       # whitespace tokenization
from sklearn.model_selection import train_test_split
# Split the texts (message bodies) and the answer labels (ham or smishing)
# into train/test sets. random_state fixes the shuffle; test_size defaults to 0.25.
text_train, text_test, label_train, label_test = train_test_split(text_split, label, random_state=0)

print(len(text_train), len(text_test))
print(len(label_train), len(label_test))
# Custom stop-word list: tokens to drop from every message
stop_words = ['XXX', 'I', 'You', 'I', 'Only', 'Time', 'Visual']

def rm_stop(token):
    # Remove stop words from each tokenized message; a token may carry a
    # "/POS" tag suffix, so compare only the surface form before the slash.
    final = []
    for words in token:
        word_list = []
        for word in words:
            if word.split("/")[0] not in stop_words:
                word_list.append(word)
        final.append(word_list)
    return final

text_train = rm_stop(text_train)
text_test = rm_stop(text_test)
from gensim.models import Word2Vec
# Skip-gram (sg=1); note that size= was renamed vector_size= in gensim 4+
w2v_skip_gram = Word2Vec(text_train, size=100, window=10, min_count=10, workers=4, sg=1)
# CBOW (sg=0)
w2v_CBOW = Word2Vec(text_train, size=100, window=10, min_count=10, workers=4, sg=0)
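# The trained embeddings can be persisted with gensim's standard save() call
# and reloaded later with Word2Vec.load(); the file names here are only
# placeholders, not fixed by the rest of this code.
w2v_skip_gram.save('w2v_skip_gram.model')
w2v_CBOW.save('w2v_CBOW.model')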
# Text embedding: TF-IDF-weighted average of word vectors
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

class TfidfEmbeddingVectorizer(object):
    # Turns a tokenized message into one vector: the mean of its word
    # vectors, each weighted by that word's TF-IDF idf value.
    def __init__(self, word2vec):
        self.word2vec = word2vec

    def transform(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)  # X is already tokenized
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)  # words unseen during fit get the largest (rarest) weight
        word2weight = defaultdict(lambda: max_idf,
                                  [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        array_list = []
        for words in X:
            # model.wv[...] works across gensim versions; a message with no
            # known words falls back to the zero vector
            vectors = [self.word2vec.wv[w] * word2weight[w] for w in words if w in self.word2vec.wv]
            array_list.append(np.mean(vectors or [np.zeros(100)], axis=0))
        return array_list

vec_tf_skip_gram = TfidfEmbeddingVectorizer(w2v_skip_gram)
vec_tf_CBOW = TfidfEmbeddingVectorizer(w2v_CBOW)
# skip-gram features
train_tf_s = vec_tf_skip_gram.transform(text_train)
test_tf_s = vec_tf_skip_gram.transform(text_test)
# CBOW features
train_tf_c = vec_tf_CBOW.transform(text_train)
test_tf_c = vec_tf_CBOW.transform(text_test)
from sklearn.svm import SVC
# Use one SVC instance per feature set; fitting the same object twice would
# leave both names pointing at the second (CBOW) model.
svc_clf_s = SVC(decision_function_shape='ovo').fit(train_tf_s, label_train)  # skip-gram
svc_clf_c = SVC(decision_function_shape='ovo').fit(train_tf_c, label_train)  # CBOW
svc_pred_s = svc_clf_s.predict(test_tf_s)  # skip-gram
svc_pred_c = svc_clf_c.predict(test_tf_c)  # CBOW
from sklearn import metrics
print(metrics.classification_report(label_test, svc_pred_s)) # skip-gram

print(metrics.classification_report(label_test, svc_pred_c)) # CBOW
from sklearn import neighbors
clf1 = neighbors.KNeighborsClassifier()
knn_clf = clf1.fit(train_tf_s, label_train)  # KNN on skip-gram features
knn_pred = knn_clf.predict(test_tf_s)

print(metrics.classification_report(label_test, knn_pred))
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier()
RF_clf = clf2.fit(train_tf_c, label_train)  # random forest on CBOW features
RF_pred = RF_clf.predict(test_tf_c)
print(metrics.classification_report(label_test, RF_pred))
RF_clf.score(test_tf_c, label_test)  # score against the same (CBOW) feature set
public_test = pd.read_csv(r'C:\Users\chlwn\Desktop\JUNO\data\smishing data\public_test.csv')
public_text = list(public_test['text'])  # take the text column of the *test* data, not the training list

public_text1 = [re.sub(r'\d+', ' ', tmp) for tmp in public_text]
public_text2 = [re.sub(r'\W+', ' ', tmp) for tmp in public_text1]
public_text_split = [tmp.split(' ') for tmp in public_text2]
public_text_split = rm_stop(public_text_split)  # same stop-word filtering as in training
print(public_text_split)
# skip-gram features
public_test_tf_s = vec_tf_skip_gram.transform(public_text_split)
# CBOW features
public_test_tf_c = vec_tf_CBOW.transform(public_text_split)
print(RF_clf.predict_proba(public_test_tf_c))
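# The columns of predict_proba follow RF_clf.classes_, so the probability of
# the smishing class (assumed here to be labeled 1) can be pulled out like this:
proba = RF_clf.predict_proba(public_test_tf_c)
smishing_prob = proba[:, list(RF_clf.classes_).index(1)]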
# Pair each message id with the hard prediction; RF_clf was trained on CBOW
# features, so predict from the CBOW feature set as well.
S = list(zip(public_test['id'], RF_clf.predict(public_test_tf_c)))
print(S)
df3 = pd.DataFrame(S)
df3.to_csv(r'C:\Users\chlwn\Desktop\JUNO\data\smishing data\public_test_RF_4.csv', encoding='UTF-8')
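# To save the probability per id instead of the 0/1 label (smishing_prob comes
# from the snippet above; the output file name is only a placeholder):
df_prob = pd.DataFrame({'id': public_test['id'], 'smishing': smishing_prob})
df_prob.to_csv('public_test_RF_prob.csv', index=False, encoding='UTF-8')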

**When I extract the predictions like this and write them to a CSV file, the smishing column comes out all zeros. I'm asking because I think these values are wrong.**

python3.6.1 deep-learning


1 Answer

Please refer to the Colab notebook linked below.

To run this on your own PC, download the word2vec model file and the random forest classifier file, then run the code from the Colab notebook; a minimal loading sketch follows the links.

Training data

https://drive.google.com/open?id=1nCJG5jBbgn2TvrfvHcnKu8XcNGdm5neu

Word2vec model file

https://drive.google.com/open?id=1FWNLMbXqegYPuG14RQOKBBsuAIF8Orb6

Random forest classifier file

https://drive.google.com/open?id=1-1UwGjgB3ZbALgcv4tK-UpqG2mh8lgg9

Colab notebook

https://colab.research.google.com/drive/1_eV4_y2WjrrEnlt-Aiv_vPLMMF37oZk3#scrollTo=dTc03PKYDQDo
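For reference, here is a minimal sketch of that loading step. It assumes the word2vec model was saved with gensim's Word2Vec.save and the classifier was pickled, reuses the TfidfEmbeddingVectorizer class and public_text_split from the question's code, and uses placeholder file names rather than the exact names of the downloads above.

import pickle
from gensim.models import Word2Vec

# Load the downloaded embedding model and classifier (placeholder file names).
w2v = Word2Vec.load('word2vec.model')
with open('rf_classifier.pkl', 'rb') as f:
    rf_clf = pickle.load(f)

# Vectorize new messages the same way as in training, then read off the
# smishing probability column (predict_proba columns follow rf_clf.classes_).
vec = TfidfEmbeddingVectorizer(w2v)
features = vec.transform(public_text_split)
proba = rf_clf.predict_proba(features)
print(proba[:, list(rf_clf.classes_).index(1)])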


