Sunday, January 12, 2020

[python] doc2vec tutorial, step-by-step

|- doc2vec_Chinese      // new folder
   |- doc2vec.py        // new Python script
   |- data
      |- rawdata.txt
   |- model

Dataset source: https://github.com/draguitar/doc2vec_Chinese/blob/master/data/rawdata.txt
Import the modules
Set up the stop-word list

# %%
import os
# project path
os.chdir("D:/python/doc2vec")

import jieba
from gensim.models.doc2vec import Doc2Vec, TaggedDocument  # import doc2vec from gensim

# '瓔珞' was added manually to the custom userdict.txt
jieba.load_userdict("./jieba/userdict.txt")

# stop words
stoplist = ['的', '了', '被', '。', ',', '、', '她', '自己', '他', '並', '和', '都', '去', '\n']
# %%
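For reference, jieba's user dictionary is a plain-text file with one entry per line, in the form "word frequency part-of-speech", where frequency and part-of-speech are optional. A minimal ./jieba/userdict.txt for this corpus might look like the lines below; only 瓔珞 is mentioned in the post, the second name is an illustrative assumption:

# contents of ./jieba/userdict.txt (one entry per line)
# 瓔珞 is the entry the post actually adds; 傅恆 is assumed for illustration
瓔珞 100 nr
傅恆 100 nr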
Chinese word segmentation with jieba
# %%
# Chinese word segmentation
def cut_files():
    filePath = 'data/rawdata.txt'
    with open(filePath, 'r', encoding="utf-8") as fr, \
         open('data/rawdata_jieba.txt', 'w', encoding="utf-8") as fvideo:
        for line in fr:
            # join the segmented tokens of each line with spaces
            curLine = ' '.join(jieba.cut(line))
            fvideo.write(curLine)
# %%
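As a quick sanity check, this is roughly what the segmented output looks like (a sketch; the exact split depends on jieba's built-in dictionary and the user dictionary loaded above):

print(' '.join(jieba.cut('瓔珞和皇后都去見皇上')))
# expected output along the lines of: 瓔珞 和 皇后 都 去 見 皇上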
Convert the segmented text into TaggedDocument objects
# %%
def get_dataset():
    with open("data/rawdata_jieba.txt", 'r', encoding="utf-8") as cf:
        docs = cf.readlines()

    # remove stop words
    for idx in range(len(docs)):
        docs[idx] = ' '.join([word for word in docs[idx].split() if word not in stoplist])
    docs = [doc for doc in docs if len(doc) > 0]
    print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        l = len(word_list)
        word_list[l - 1] = word_list[l - 1].strip()
        document = TaggedDocument(word_list, tags=[i])
        x_train.append(document)

    return x_train
# %%
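Each training item pairs a token list with a unique integer tag. Once cut_files() has produced data/rawdata_jieba.txt, inspecting the first item should print something along these lines (the tokens shown are illustrative):

x_train = get_dataset()
print(x_train[0])
# e.g. TaggedDocument(words=['瓔珞', '進', '宮', ...], tags=[0])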
# %%
# train the model
def train(x_train, size=200, epoch_num=1):  # size=200 -> 200-dimensional vectors
    # build the Doc2Vec model; passing the corpus to the constructor
    # builds the vocabulary and trains in one step
    # (gensim 4.x renamed the old `size` parameter to `vector_size`)
    model_dm = Doc2Vec(x_train, min_count=1, window=3, vector_size=size,
                       sample=1e-3, negative=5, workers=4, epochs=epoch_num)
    model_dm.save('model/model_dm_doc2vec')

    return model_dm
# %%
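Alternatively, the vocabulary build and the training pass can be separated, which makes it easier to train for more epochs; a minimal sketch, assuming gensim 4.x parameter names:

# two-step alternative: build the vocabulary first, then train explicitly
model_dm = Doc2Vec(vector_size=200, min_count=1, window=3,
                   sample=1e-3, negative=5, workers=4)
model_dm.build_vocab(x_train)
model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70)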
# %%
def test():
    # model_dm = Doc2Vec.load("model/model_dm_doc2vec")
    test_text = ['我', '喜歡', '傅恆']
    inferred_vector_dm = model_dm.infer_vector(test_text)
    # top-10 most similar documents
    # (gensim 4.x renamed `model.docvecs` to `model.dv`)
    sims = model_dm.dv.most_similar([inferred_vector_dm], topn=10)
    return sims
# %%
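Note that infer_vector is stochastic: inferring the same text twice yields slightly different vectors. Raising the number of inference epochs makes the result more stable; a sketch, assuming the gensim 4.x `epochs` keyword:

v1 = model_dm.infer_vector(['我', '喜歡', '傅恆'])
v2 = model_dm.infer_vector(['我', '喜歡', '傅恆'])
# v1 and v2 will differ slightly; more inference epochs reduce the variance
v3 = model_dm.infer_vector(['我', '喜歡', '傅恆'], epochs=50)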
# %%
if __name__ == '__main__':
    cut_files()
    x_train = get_dataset()
    model_dm = train(x_train)
    sims = test()
    for count, sim in sims:
        sentence = x_train[count]
        words = ' '.join(sentence.words)
        print(words, sim, len(sentence.words))
# %%
# word-level similarity (in gensim 4.x this lives on model.wv)
print(model_dm.wv.similarity('瓔珞', '皇后'))
print(model_dm.wv.similarity('瓔珞', '皇上'))
# print(model_dm.wv.key_to_index)  # the vocabulary (was model_dm.wv.vocab in gensim 3.x)
# %%
'''
Basic example from the official documentation:
https://radimrehurek.com/gensim/models/doc2vec.html
'''
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
print(documents)
# %%
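The official page continues by fitting a small model on those documents and inferring a vector for a new token list; completing the example with the same parameters used in the gensim documentation:

model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
vector = model.infer_vector(["system", "response"])
print(vector)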