Text Classification with tf.keras (2.0) & soynlp (DNN, RNN, CNN)
Dan-k 2020. 6. 12. 11:42
soynlp is used for morpheme segmentation, and tf.keras is used to build a binary classifier. Simple DNN, RNN, and CNN models are applied.
import numpy as np
import pandas as pd
from soynlp.tokenizer import MaxScoreTokenizer
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer
import os
import shutil
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.layers import (
Embedding,
Flatten,
GRU,
Conv1D,
Lambda,
Dense,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
print(tf.__version__)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
LOGDIR = "./text_models"
1. Preprocess
data = pd.read_csv('label_data.csv',encoding='utf-8',index_col=0)
data.head()
data['contents'] = data['contents'].apply(str)
corpus = data.contents.apply(str)
word_extractor = WordExtractor()
word_extractor.train(corpus)
word_score = word_extractor.extract()
scores = {word:score.cohesion_forward for word, score in word_score.items()}
maxscore_tokenizer = MaxScoreTokenizer(scores=scores)
def soynlp_morphs(contents):
return ' '.join(maxscore_tokenizer.tokenize(contents))
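As a quick check, the helper can be applied to a single sentence (hypothetical example; the actual split depends on the cohesion scores learned from the corpus):
# hypothetical sentence; the tokens returned depend on the trained scores
sample = '유튜브 댓글을 형태소 단위로 분리한다'
print(soynlp_morphs(sample))
# e.g. '유튜브 댓글 을 형태소 단위 로 분리 한다'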
%%time
data['soynlp_morphs_contents'] = data['contents'].apply(soynlp_morphs)
X = data.soynlp_morphs_contents
y = data.label
## X,y
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X) ### build the vocab, give each word a unique index, then map each token to its index in order
word_to_index = tokenizer.word_index
VOCAB_SIZE = len(word_to_index) + 1 ## +1 for the 0 index used for padding
MAX_LEN = max(len(seq) for seq in sequences)
MAX_LEN
def encode_labels(sources):
classes = [source for source in sources]
one_hots = to_categorical(classes)
return one_hots
def create_sequences(texts, max_len=MAX_LEN):
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, max_len, padding='post')
return padded_sequences
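A quick sanity check of the two helpers (hypothetical inputs; the exact integer ids depend on the fitted tokenizer):
# hypothetical inputs; ids depend on the fitted tokenizer, output shape is (n_texts, MAX_LEN)
demo = create_sequences(['유튜브 댓글', '형태소 분리'])
print(demo.shape)             # (2, MAX_LEN), zero-padded at the end ('post')
print(encode_labels([0, 1]))  # [[1. 0.] [0. 1.]]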
X_train, X_test, y_train, y_test = train_test_split(create_sequences(X), encode_labels(y), test_size=0.3, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
N_CLASSES = 2
2. Train
DNN
def build_dnn_model(embed_dim):
model = Sequential([
Embedding(VOCAB_SIZE + 1, embed_dim, input_shape=[MAX_LEN]),
Lambda(lambda x: tf.reduce_mean(x, axis=1)),
Dense(100, activation='relu'),
Dense(100, activation='relu'),
Dense(N_CLASSES, activation='softmax') ## activation=tf.nn.softmax
])
model.compile(
optimizer=tf.keras.optimizers.Adam(
learning_rate=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
name='Adam'
),
loss='categorical_crossentropy',
metrics=['accuracy']
)
return model
%%time
tf.random.set_seed(33) # available in TF 2.0
MODEL_DIR = os.path.join(LOGDIR, 'dnn')
shutil.rmtree(MODEL_DIR, ignore_errors=True)
BATCH_SIZE = 300
EPOCHS = 50
EMBED_DIM = 100
PATIENCE = 0
dnn_model = build_dnn_model(embed_dim=EMBED_DIM)
dnn_history = dnn_model.fit(
X_train, y_train,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
validation_data=(X_valid, y_valid),
callbacks=[EarlyStopping(patience=PATIENCE), TensorBoard(MODEL_DIR)],
)
## In TF 2.0 the history keys are 'accuracy', 'val_accuracy', 'loss', 'val_loss'
pd.DataFrame(dnn_history.history)[['loss', 'val_loss']].plot()
pd.DataFrame(dnn_history.history)[['accuracy', 'val_accuracy']].plot()
dnn_model.summary()
dnn_history.history.keys()
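Because each model writes logs to its own subdirectory under LOGDIR through the TensorBoard callback, the training curves can also be viewed directly in the notebook (assuming the Jupyter TensorBoard extension is installed):
%load_ext tensorboard
%tensorboard --logdir ./text_models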
RNN
def build_rnn_model(embed_dim, units):
model = Sequential([
Embedding(VOCAB_SIZE + 1, embed_dim, input_shape=[MAX_LEN], mask_zero=True),
GRU(units),
Dense(N_CLASSES, activation='softmax')
])
model.compile(
optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy']
)
return model
%%time
tf.random.set_seed(33)
MODEL_DIR = os.path.join(LOGDIR, 'rnn')
shutil.rmtree(MODEL_DIR, ignore_errors=True)
EPOCHS = 15
BATCH_SIZE = 300
EMBED_DIM = 100
UNITS = 16
PATIENCE = 0
rnn_model = build_rnn_model(embed_dim=EMBED_DIM, units=UNITS)
history = rnn_model.fit(
X_train, y_train,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
validation_data=(X_valid, y_valid),
callbacks=[EarlyStopping(patience=PATIENCE), TensorBoard(MODEL_DIR)],
use_multiprocessing=True ## set to True when an out-of-memory (OOM) error occurs
)
pd.DataFrame(history.history)[['loss', 'val_loss']].plot()
pd.DataFrame(history.history)[['accuracy', 'val_accuracy']].plot()
rnn_model.summary()
CNN
def build_cnn_model(embed_dim, filters, ksize, strides):
model = Sequential([
Embedding(
VOCAB_SIZE + 1,
embed_dim,
input_shape=[MAX_LEN],
mask_zero=True),
Conv1D(
filters=filters,
kernel_size=ksize,
strides=strides,
activation='relu',
),
Flatten(),
Dense(N_CLASSES, activation='softmax')
])
model.compile(
optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy']
)
return model
%%time
tf.random.set_seed(33)
MODEL_DIR = os.path.join(LOGDIR, 'cnn')
shutil.rmtree(MODEL_DIR, ignore_errors=True)
EPOCHS = 50
BATCH_SIZE = 300
EMBED_DIM = 100
FILTERS = 200
STRIDES = 2
KSIZE = 3
PATIENCE = 0
cnn_model = build_cnn_model(
embed_dim=EMBED_DIM,
filters=FILTERS,
strides=STRIDES,
ksize=KSIZE,
)
cnn_history = cnn_model.fit(
X_train, y_train,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
validation_data=(X_valid, y_valid),
callbacks=[EarlyStopping(patience=PATIENCE), TensorBoard(MODEL_DIR)],
)
pd.DataFrame(cnn_history.history)[['loss', 'val_loss']].plot()
pd.DataFrame(cnn_history.history)[['accuracy', 'val_accuracy']].plot()
cnn_model.summary()
3. Test
def convert_argmax(array):
return np.argmax(array, axis=1)
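convert_argmax simply turns softmax outputs (or one-hot labels) into class indices, e.g.:
print(convert_argmax(np.array([[0.9, 0.1], [0.2, 0.8]])))  # -> [0 1]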
DNN
dnn_model_pred = dnn_model.predict(X_test)
target_names = ['불건전', '건전']  # class 0: 불건전 (inappropriate), class 1: 건전 (appropriate)
print(confusion_matrix(convert_argmax(y_test), convert_argmax(dnn_model_pred)))
print(classification_report(convert_argmax(y_test), convert_argmax(dnn_model_pred), target_names=target_names))
RNN
rnn_model_pred = rnn_model.predict(X_test)
target_names = ['불건전', '건전']
print(confusion_matrix(convert_argmax(y_test), convert_argmax(rnn_model_pred)))
print(classification_report(convert_argmax(y_test), convert_argmax(rnn_model_pred), target_names=target_names))
CNN
cnn_model_pred = cnn_model.predict(X_test)
target_names = ['불건전', '건전']
print(confusion_matrix(convert_argmax(y_test), convert_argmax(cnn_model_pred)))
print(classification_report(convert_argmax(y_test), convert_argmax(cnn_model_pred), target_names=target_names))
4. Results
- Comparing recall for the inappropriate (불건전) class, performance ranked CNN > RNN > DNN, and the DNN barely classified the class at all.
- Training took 28 seconds for the DNN, 1 hour 18 seconds for the RNN, and 2 minutes 41 seconds for the CNN, i.e. DNN < CNN < RNN.
- Overall, while the results might differ with hyperparameter tuning or higher-quality data, the CNN appears to be the best fit.