
Classifying Unhealthy BTS Fanfic (Naive Bayes, Logistic Regression, RNN)

Dan-k 2020. 6. 1. 12:08

We frame this as a binary classification problem on a dataset in which unhealthy BTS fanfic is labeled 0 and healthy fanfic is labeled 1.
For the unhealthy class, all of the fanfic was downloaded first, then sentences containing specific unhealthy words were extracted and reassembled into the dataset.
The healthy fanfic is stored in GCP BigQuery and loaded from there.
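The sentence-filtering step for the unhealthy class is not shown in this post; below is a minimal sketch of the idea, assuming a hypothetical list of flagged keywords and a list of downloaded fanfic texts (collect_unhealthy_sentences and UNHEALTHY_WORDS are illustrative names, not the actual script used).

import pandas as pd

UNHEALTHY_WORDS = ['keyword1', 'keyword2']   # hypothetical list of flagged words

def collect_unhealthy_sentences(fanfic_texts, keywords=UNHEALTHY_WORDS):
    # keep only the sentences that contain at least one flagged keyword, label them 0
    rows = []
    for text in fanfic_texts:
        for sentence in text.split('.'):
            if any(word in sentence for word in keywords):
                rows.append({'contents': sentence.strip(), 'label': 0})
    return pd.DataFrame(rows)

# negat_df = collect_unhealthy_sentences(downloaded_fanfics)
# negat_df.to_csv('unhealth_data.csv', encoding='utf-8')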

 
import numpy as np
import pandas as pd

from konlpy.tag import Mecab

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

%load_ext google.cloud.bigquery
 
The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery
In [3]:
%%bigquery df
#standardSQL
SELECT * FROM `project.dataset.tabe_story`
In [4]:
## label the healthy data as 1
df['label'] = 1
In [133]:
df.shape
Out[133]:
(938, 7)
In [6]:
## healthy data
posit_df = df[['contents','label']]
## unhealthy data
negat_df = pd.read_csv('unhealth_data.csv',index_col=0)
In [7]:
posit_df.shape
Out[7]:
(938, 2)
In [8]:
posit_df = posit_df.dropna(axis=0)
In [9]:
posit_df.shape
Out[9]:
(742, 2)
In [10]:
posit_df.head()
Out[10]:
  contents label
1 hello,hi,we find to people here,hello,my dog j... 1
2 asdasdad 1
3 dasfsdfasdfsdfas 1
4 ㄹㄹㄹㄹㄹㄹㄹ 1
5 dddd 1
In [11]:
negat_df.head()
Out[11]:
  contents label
0 박지민,전정국 이 둘의 은밀한 이야기가 지금 시작된다.,꽤 오래 꽂혀있었던지라 뽁-... 0
0 조금은 애절해서 물기가 가득한 정국의 목소리가 공허한 방에 한가득 울려퍼졌다. 0
0 여전히 곤히 잠자고 있는 정국의 입술 근처 까지 얼굴을 들이댔다가 들려오는 정국의 ... 0
0 정국은 자신의 티 안으로 들어오는 차가운 손에 고개를 뒤로 꺾으며 소리를 냈다.,뒤... 0
0 그 방법은 스킨십. 포옹, 키스, 섹스. 센티넬과 가이드는 처음 만나면 각인이라는 ... 0
In [12]:
negat_df.shape
Out[12]:
(362, 2)
In [47]:
data = pd.concat([posit_df,negat_df], axis = 0)
print(data.shape)
data.head()
 
(1104, 2)
Out[47]:
  contents label
1 hello,hi,we find to people here,hello,my dog j... 1
2 asdasdad 1
3 dasfsdfasdfsdfas 1
4 ㄹㄹㄹㄹㄹㄹㄹ 1
5 dddd 1
In [48]:
data.to_csv('btsws_label_data.csv',encoding='utf-8')
 

1. Rebuild contents per tokenization type (nouns, morphs)

In [49]:
def noun_parsing(contents):
    # extract only the nouns with Mecab and join them into a space-separated string
    mecab = Mecab()
    noun_data = ' '.join(mecab.nouns(contents))
    return noun_data

def morphs_parsing(contents):
    # split into morphemes with Mecab and join them into a space-separated string
    mecab = Mecab()
    morphs_data = ' '.join(mecab.morphs(contents))
    return morphs_data
In [50]:
data['contents'] = data['contents'].astype(str)
data['noun_contents'] = data['contents'].apply(noun_parsing)
data['morphs_contents'] = data['contents'].apply(morphs_parsing)
In [51]:
data.morphs_contents.head()
Out[51]:
1    hello , hi , we find to people here , hello , ...
2                                             asdasdad
3                                     dasfsdfasdfsdfas
4                                        ㄹ ㄹ ㄹ ㄹ ㄹ ㄹ ㄹ
5                                                 dddd
Name: morphs_contents, dtype: object
 

2. Train/test split

In [52]:
X = data.morphs_contents
# X = data.contents
y = data.label
In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
 

3. Vectorizer: TF-IDF with n-grams (n-gram range 1 to 2)

In [20]:
tfidv = TfidfVectorizer(encoding='utf-8',ngram_range=(1,2)).fit(X_train)
In [21]:
X_train = tfidv.transform(X_train).toarray()
X_test = tfidv.transform(X_test).toarray()
In [22]:
X_train.shape
Out[22]:
(772, 173586)
In [23]:
# Leftover cell: this would overwrite y with the labels of the healthy-only df (all 1s);
# the y_train / y_test from the split above are what the models actually use.
# y = list(df.label.values)
 

4. Model training

In [24]:
### NB
model = BernoulliNB()
model.fit(X_train, y_train)
Out[24]:
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
In [25]:
### LR 
clf = LogisticRegression(random_state=42).fit(X_train, y_train)
 
/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
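As a side note, the FutureWarning above can be silenced by passing the solver explicitly (the default changes to 'lbfgs' in scikit-learn 0.22); an equivalent call would be:

# specifying the solver avoids the FutureWarning and matches the upcoming default
clf = LogisticRegression(solver='lbfgs', random_state=42).fit(X_train, y_train)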
 

5. Model evaluation

 

NB

In [26]:
y_pred = model.predict(X_test)
In [27]:
target_names = ['부정', '긍정']
print(classification_report(y_test, y_pred, target_names=target_names))
 
              precision    recall  f1-score   support

          부정       1.00      0.28      0.44        99
          긍정       0.77      1.00      0.87       233

   micro avg       0.79      0.79      0.79       332
   macro avg       0.88      0.64      0.65       332
weighted avg       0.84      0.79      0.74       332

In [28]:
confusion_matrix(y_test, y_pred)
Out[28]:
array([[ 28,  71],
       [  0, 233]])
 

LR

In [29]:
y_pred = clf.predict(X_test)
In [30]:
target_names = ['부정', '긍정']
print(classification_report(y_test, y_pred, target_names=target_names))
 
              precision    recall  f1-score   support

          부정       0.99      0.95      0.97        99
          긍정       0.98      1.00      0.99       233

   micro avg       0.98      0.98      0.98       332
   macro avg       0.98      0.97      0.98       332
weighted avg       0.98      0.98      0.98       332

In [31]:
confusion_matrix(y_test, y_pred)
Out[31]:
array([[ 94,   5],
       [  1, 232]])
 

Interpreting the LR model

In [93]:
importance_dic = { ind:v for ind,v in enumerate(clf.coef_[0])}
In [105]:
# sort in descending order by the absolute value of the coefficient
importance_dic_reverse = sorted(importance_dic.items(), 
                              reverse=True, 
                              key=lambda item: abs(item[1]))
In [114]:
# take the top 5 features by coefficient magnitude
importance_feature_lst = importance_dic_reverse[:5]
In [108]:
features_name = tfidv.get_feature_names()
In [115]:
for i,_ in importance_feature_lst:
    print(features_name[i])
 
소리
신음
목소리
새끼
섹스
 

We originally intended to restrict the tokens to nouns or to a subset of POS tags, but for now the text is tokenized at the morpheme level and those tokens are vectorized.
For the classical machine learning models (Naive Bayes and Logistic Regression), the text was vectorized with a TF-IDF vectorizer using an n-gram range of (1, 2).

To interpret the Logistic Regression model, which performed best with a recall of 0.95 on the unhealthy class, we examined the word features with the largest coef_ values.
Extracting the top 5 words by weight shows that clearly unhealthy words received large weights, which supports both the interpretability and the accuracy of the model.
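Since the sorting above uses absolute values, the sign of each coefficient is also worth checking: with classes_ == [0, 1], a large negative coefficient pushes a document toward label 0 (unhealthy) and a large positive one toward label 1 (healthy). A small sketch (not in the original notebook) that lists the signed extremes:

import numpy as np

coefs = clf.coef_[0]
feature_names = np.array(tfidv.get_feature_names())

neg_idx = np.argsort(coefs)[:5]           # most negative -> strongest unhealthy indicators
pos_idx = np.argsort(coefs)[-5:][::-1]    # most positive -> strongest healthy indicators

print('unhealthy indicators:', feature_names[neg_idx])
print('healthy indicators  :', feature_names[pos_idx])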

 

 

6. Prediction

In [130]:
test_text = '벌써 일주일이나 지났다.'
X_pred = tfidv.transform([test_text])
y_pred = clf.predict(X_pred)
In [131]:
y_pred 
Out[131]:
array([1])
 

 

Additionally, with the RNN the results kept changing from one training run to the next (as is typical of deep learning), and preprocessing had a large influence on performance.
Comparing noun-only tokenization against full morpheme-level tokenization, the morpheme-level tokens, which carry more information, gave better performance.

Predicting a few individual sentences online, the model was less accurate than expected, so it needs more data and a more systematic preprocessing pipeline to be improved further.
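Single-sentence ("online") prediction with the RNN is not shown below, so here is a minimal sketch, assuming the tokenizer, max_len, and trained model built in the RNN section that follows (predict_sentence_rnn is an illustrative helper, not part of the original notebook):

def predict_sentence_rnn(text, tokenizer, model, max_len):
    tokens = morphs_parsing(text)                  # same Mecab morpheme tokenization as training
    seq = tokenizer.texts_to_sequences([tokens])   # map words to the ids learned on the training set
    padded = pad_sequences(seq, maxlen=max_len, padding='pre')
    prob = model.predict(padded)[0][0]             # sigmoid output: P(label == 1, i.e. healthy)
    return 1 if prob >= 0.5 else 0

# predict_sentence_rnn('벌써 일주일이나 지났다.', tokenizer, model, max_len)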

 

sequenceVec -> Embedding layer -> RNN

In [53]:
import matplotlib.pyplot as plt

from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
In [68]:
data = pd.read_csv('btsws_label_data.csv',encoding='utf-8',index_col=0)
data['contents'] = data['contents'].astype(str)
data['noun_contents'] = data['contents'].apply(noun_parsing)
data['morphs_contents'] = data['contents'].apply(morphs_parsing)

X = data.morphs_contents
# X = data.contents
y = data.label
In [69]:
## X,y
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
In [70]:
sequences = tokenizer.texts_to_sequences(X)  ### after building the vocab and giving each word a unique integer id, map each token to its id in sequence order
In [71]:
X_data = sequences 
In [72]:
word_to_index = tokenizer.word_index
In [73]:
vocab_size = len(word_to_index) + 1 ## add 1 for index 0, which is used for padding
print(vocab_size)
 
26712
In [74]:
X_data = sequences
print('Max sentence length : %d' % max(len(l) for l in X_data))
print('Mean sentence length : %f' % (sum(map(len, X_data))/len(X_data)))
plt.hist([len(s) for s in X_data], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()
 
Max sentence length : 16164
Mean sentence length : 650.883152
 
In [75]:
max_len = max(len(l) for l in X_data)
# pad every sequence in the dataset to max_len
data = pad_sequences(X_data, maxlen = max_len, padding='pre')  ### fills the empty positions with 0; padding= controls whether the zeros go in front ('pre') or at the back ('post')
print("Training data shape: ", data.shape)
 
Training data shape:  (1104, 16164)
In [76]:
len(data)
Out[76]:
1104
In [77]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=42)
In [78]:
### RNN
model = Sequential()
model.add(Embedding(vocab_size, 32)) # embedding vector dimension is 32
model.add(SimpleRNN(32)) # hidden size of the RNN cell is 32
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=15, batch_size=64, validation_split=0.2)
 
Train on 617 samples, validate on 155 samples
Epoch 1/15
617/617 [==============================] - 31s 51ms/sample - loss: 0.5930 - acc: 0.6856 - val_loss: 0.6347 - val_acc: 0.6516
Epoch 2/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.5463 - acc: 0.7083 - val_loss: 0.4959 - val_acc: 0.8710
Epoch 3/15
617/617 [==============================] - 31s 51ms/sample - loss: 0.3284 - acc: 0.9498 - val_loss: 0.3032 - val_acc: 0.9419
Epoch 4/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.2321 - acc: 0.9530 - val_loss: 0.3345 - val_acc: 0.9355
Epoch 5/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.2447 - acc: 0.9271 - val_loss: 0.2834 - val_acc: 0.9226
Epoch 6/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.1334 - acc: 0.9838 - val_loss: 0.2148 - val_acc: 0.9484
Epoch 7/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.1139 - acc: 0.9822 - val_loss: 0.4834 - val_acc: 0.7742
Epoch 8/15
617/617 [==============================] - 31s 51ms/sample - loss: 0.1000 - acc: 0.9822 - val_loss: 0.2078 - val_acc: 0.9419
Epoch 9/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.0761 - acc: 0.9903 - val_loss: 0.2016 - val_acc: 0.9484
Epoch 10/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.0693 - acc: 0.9903 - val_loss: 0.3131 - val_acc: 0.8645
Epoch 11/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.0834 - acc: 0.9903 - val_loss: 0.2410 - val_acc: 0.9097
Epoch 12/15
617/617 [==============================] - 31s 51ms/sample - loss: 0.0551 - acc: 0.9919 - val_loss: 0.1853 - val_acc: 0.9548
Epoch 13/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.0522 - acc: 0.9903 - val_loss: 0.2012 - val_acc: 0.9419
Epoch 14/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.0452 - acc: 0.9919 - val_loss: 0.2065 - val_acc: 0.9419
Epoch 15/15
617/617 [==============================] - 31s 50ms/sample - loss: 0.0432 - acc: 0.9919 - val_loss: 0.3598 - val_acc: 0.8258
 
Visualizing the results
In [79]:
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
 
 
In [80]:
y_pred = model.predict(X_test)
In [81]:
target_names = ['부정', '긍정']
print(classification_report(y_test, y_pred.round(), target_names=target_names))
 
              precision    recall  f1-score   support

          부정       0.82      0.90      0.86        99
          긍정       0.96      0.92      0.94       233

   micro avg       0.91      0.91      0.91       332
   macro avg       0.89      0.91      0.90       332
weighted avg       0.92      0.91      0.91       332

In [82]:
confusion_matrix(y_test, y_pred.round())
Out[82]:
array([[ 89,  10],
       [ 19, 214]])