Sunday, January 12, 2020

[python] doc2vec tutorial, step-by-step

|- doc2vec_Chinese    // create this folder
    |- doc2vec.py     // create this Python script
    |- data
        |- rawdata.txt
    |- model

Dataset source: https://github.com/draguitar/doc2vec_Chinese/blob/master/data/rawdata.txt
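If you don't have the corpus locally yet, here is a minimal sketch to download it (the raw-file URL is assumed from the repository link above):
# %%
import os
import urllib.request

os.makedirs('data', exist_ok=True)
# Assumed raw-file URL for the dataset linked above
url = 'https://raw.githubusercontent.com/draguitar/doc2vec_Chinese/master/data/rawdata.txt'
urllib.request.urlretrieve(url, 'data/rawdata.txt')
# %%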
Import the modules
Set up the stop words

# %%
import os
# project path
os.chdir("D:/python/doc2vec")

import jieba
from gensim.models.doc2vec import Doc2Vec, TaggedDocument  # import doc2vec from gensim
# (LabeledSentence was removed in gensim 4.0; TaggedDocument replaces it)

# '瓔珞' was added by hand to the custom dictionary userdict.txt
jieba.load_userdict("./jieba/userdict.txt")

# stop words
stoplist = ['的', '了', '被', '。', ',', '、', '她', '自己', '他', '並', '和', '都', '去', '\n']
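For reference, jieba's user dictionary is a plain-text file with one entry per line: the word, then an optional frequency and part-of-speech tag. A sketch that would create the entry loaded above:
# %%
# Create the custom dictionary (jieba userdict format: word [freq] [POS tag])
import os
os.makedirs('./jieba', exist_ok=True)
with open('./jieba/userdict.txt', 'w', encoding='utf-8') as f:
    f.write('瓔珞 3 nr\n')  # nr = person name; the frequency 3 is an arbitrary choice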
# %%
Chinese word segmentation with jieba
# %%
# Chinese word segmentation
def cut_files():
    filePath = 'data/rawdata.txt'
    # write the space-separated tokens to a new file, closing both files when done
    with open(filePath, 'r', encoding='utf-8') as fr, \
         open('data/rawdata_jieba.txt', 'w', encoding='utf-8') as fw:
        for line in fr:
            curLine = ' '.join(jieba.cut(line))
            fw.write(curLine)
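Before segmenting the whole corpus, a quick sanity check on a single made-up sentence:
# %%
# Sanity check: segment one sentence
print(' '.join(jieba.cut('瓔珞喜歡皇上')))  # with the user dictionary loaded, 瓔珞 stays one token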
# %%
Convert the segmented text into TaggedDocument objects
# %%
def get_dataset():
    with open("data/rawdata_jieba.txt", 'r', encoding="utf-8") as cf:
        docs = cf.readlines()

        # remove stop words
        for idx in range(len(docs)):
            docs[idx] = ' '.join(word for word in docs[idx].split() if word not in stoplist)
        docs = [doc for doc in docs if len(doc) > 0]
        print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        word_list[-1] = word_list[-1].strip()  # drop any trailing newline
        document = TaggedDocument(word_list, tags=[i])
        x_train.append(document)

    return x_train
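Each training example pairs a token list with an integer tag. A quick peek at the first one (assuming cut_files() has already been run):
# %%
# Inspect the first training example
sample = get_dataset()[0]
print(sample.words[:10], sample.tags)  # TaggedDocument exposes .words and .tags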
# %%
Train the model
# %%
# Train the model
def train(x_train, size=200, epoch_num=70):  # size=200 -> 200-dimensional vectors
    # Build and train a Doc2Vec model in one call
    # (gensim >= 4.0 names the dimension parameter vector_size; gensim < 4.0 used size)
    model_dm = Doc2Vec(x_train, min_count=1, window=3, vector_size=size,
                       sample=1e-3, negative=5, workers=4, epochs=epoch_num)
    model_dm.save('model/model_dm_doc2vec')

    return model_dm
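The saved model can be reloaded in a later session without retraining (this is what the commented-out line in test() below does):
# %%
# Reload the persisted model from disk
model_dm = Doc2Vec.load('model/model_dm_doc2vec')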
# %%
Infer a vector for unseen text and query the most similar documents
# %%
def test(model_dm):
#    model_dm = Doc2Vec.load("model/model_dm_doc2vec")
    test_text = ['我', '喜歡', '傅恆']
    inferred_vector_dm = model_dm.infer_vector(test_text)

    # top-10 most similar documents
    # (gensim >= 4.0 exposes document vectors as model.dv; gensim < 4.0 used model.docvecs)
    sims = model_dm.dv.most_similar([inferred_vector_dm], topn=10)
    return sims
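Note that infer_vector() is stochastic, so repeated calls return slightly different vectors; a larger epochs value makes the result more stable (a minimal sketch using the trained model):
# %%
# Reduce inference noise with more inference passes
stable_vec = model_dm.infer_vector(['我', '喜歡', '傅恆'], epochs=50)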
# %%
Run the full pipeline
# %%
if __name__ == '__main__':
    cut_files()
    x_train = get_dataset()
    model_dm = train(x_train)
    sims = test(model_dm)
    for count, sim in sims:
        sentence = x_train[count]
        words = ' '.join(sentence.words)
        print(words, sim, len(sentence.words))
# %%
# word-level similarity
# (gensim >= 4.0 exposes word vectors as model.wv; the old model.similarity shortcut is gone)
print(model_dm.wv.similarity('瓔珞', '皇后'))
print(model_dm.wv.similarity('瓔珞', '皇上'))
# print(model_dm.wv.key_to_index)  # vocabulary (wv.vocab in gensim < 4.0)
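The word vectors support the usual neighbour queries as well:
# %%
# Words closest to 瓔珞 in the learned embedding space
print(model_dm.wv.most_similar('瓔珞', topn=5))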
# %%
'''
Basic example from the official documentation:
https://radimrehurek.com/gensim/models/doc2vec.html
'''
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
print(documents)
# %%
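Training on this toy corpus follows the same quick-start pattern as the official documentation (vector_size=5 and window=2 are the documentation's toy settings):
# %%
# Train a tiny model on the toy corpus, then infer a vector for unseen words
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
vector = model.infer_vector(['human', 'interface'])
print(vector)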