```
|- doc2vec_Chinese      // new folder
   |- doc2vec.py        // new Python script
   |- data
      |- rawdata.txt
   |- model
```

Dataset source: https://github.com/draguitar/doc2vec_Chinese/blob/master/data/rawdata.txt
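If you don't want to clone the whole repository, the raw file can be fetched directly. A minimal sketch, assuming the `requests` package is installed and using the standard raw.githubusercontent.com form of the blob link above:

```python
# Download rawdata.txt into data/ (sketch; assumes the requests package
# and the raw.githubusercontent.com form of the blob URL above)
import os
import requests

url = "https://raw.githubusercontent.com/draguitar/doc2vec_Chinese/master/data/rawdata.txt"
os.makedirs("data", exist_ok=True)
resp = requests.get(url, timeout=30)
resp.raise_for_status()
with open("data/rawdata.txt", "wb") as f:
    f.write(resp.content)
```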
Import modules
Set up the stop-word list
```python
# %%
import os
# Project path
os.chdir("D:/python/doc2vec")

import jieba
from gensim.models.doc2vec import Doc2Vec, TaggedDocument  # import doc2vec from gensim

# '瓔珞' was added manually to the custom dictionary userdict.txt,
# so jieba keeps the character's name as a single token
jieba.load_userdict("./jieba/userdict.txt")

# Stop words
stoplist = ['的', '了', '被', '。', ',', '、', '她', '自己', '他', '並', '和', '都', '去', '\n']
# %%
```
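A quick way to confirm the custom dictionary is active: with the userdict entry loaded, jieba should keep '瓔珞' as a single token instead of splitting it into individual characters. The sample phrase and expected output below are only illustrative:

```python
# Sanity check: with userdict.txt loaded, '瓔珞' stays one token
# (sample phrase and expected output are illustrative)
import jieba
jieba.load_userdict("./jieba/userdict.txt")

print(list(jieba.cut('瓔珞進宮')))  # expected: ['瓔珞', '進宮']
```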
Chinese word segmentation with jieba
```python
# %%
# Segment the Chinese text with jieba
def cut_files():
    filePath = 'data/rawdata.txt'
    # Write each line back out as space-separated tokens
    with open(filePath, 'r', encoding="utf-8") as fr, \
         open('data/rawdata_jieba.txt', 'w', encoding="utf-8") as fw:
        for line in fr:
            curLine = ' '.join(jieba.cut(line))
            fw.write(curLine)
# %%
```
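After `cut_files()` runs, each line of `data/rawdata_jieba.txt` holds the original line as space-separated tokens. A quick peek at the first line (the actual content depends on rawdata.txt):

```python
# Peek at the first segmented line (content depends on rawdata.txt)
cut_files()
with open('data/rawdata_jieba.txt', encoding='utf-8') as f:
    print(f.readline())
```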
Convert the text into TaggedDocument objects
```python
# %%
def get_dataset():
    with open("data/rawdata_jieba.txt", 'r', encoding="utf-8") as cf:
        docs = cf.readlines()

    # Drop the stop words from every document
    for idx in range(len(docs)):
        docs[idx] = ' '.join(word for word in docs[idx].split() if word not in stoplist)
    docs = [doc for doc in docs if len(doc) > 0]
    print(len(docs))

    # Wrap each document's token list in a TaggedDocument, using its index as the tag
    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        word_list[-1] = word_list[-1].strip()
        x_train.append(TaggedDocument(word_list, tags=[i]))

    return x_train
# %%
```
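To see what the conversion produces, the first training example can be inspected; the printed tokens depend entirely on the corpus:

```python
# Peek at the first TaggedDocument (tokens shown depend on the corpus)
x_train = get_dataset()
print(x_train[0].words[:10], x_train[0].tags)
```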
```python
# %%
# Train the model
def train(x_train, size=200, epoch_num=10):  # size=200 -> 200-dimensional vectors
    # Build the Doc2Vec model (gensim >= 4.0 uses vector_size instead of size)
    model_dm = Doc2Vec(x_train, min_count=1, window=3, vector_size=size,
                       sample=1e-3, negative=5, workers=4, epochs=epoch_num)
    model_dm.save('model/model_dm_doc2vec')

    return model_dm
# %%
```
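Since `train()` persists the model to `model/model_dm_doc2vec`, it can be reloaded later without retraining (this mirrors the commented-out load in `test()` below):

```python
# Reload the saved model from disk instead of retraining
from gensim.models.doc2vec import Doc2Vec

model_dm = Doc2Vec.load('model/model_dm_doc2vec')
```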
```python
# %%
def test(model_dm):
    # model_dm = Doc2Vec.load("model/model_dm_doc2vec")
    test_text = ['我', '喜歡', '傅恆']
    inferred_vector_dm = model_dm.infer_vector(test_text)

    # Top 10 most similar documents (gensim >= 4.0 uses model.dv instead of model.docvecs)
    sims = model_dm.dv.most_similar([inferred_vector_dm], topn=10)
    return sims
# %%
```
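Note that `infer_vector` is stochastic: repeated calls on the same tokens return slightly different vectors, so the top-10 list can vary between runs. Raising the inference epochs makes the result more stable (the value 50 below is an arbitrary illustrative choice):

```python
# infer_vector is stochastic; more inference epochs give a more stable vector
# (epochs=50 is an arbitrary illustrative value)
vec = model_dm.infer_vector(['我', '喜歡', '傅恆'], epochs=50)
```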
```python
# %%
if __name__ == '__main__':
    cut_files()
    x_train = get_dataset()
    model_dm = train(x_train)
    sims = test(model_dm)
    for count, sim in sims:
        sentence = x_train[count]
        words = ' '.join(sentence.words)
        print(words, sim, len(sentence.words))
# %%
# Word similarity (gensim >= 4.0 uses model.wv.similarity)
print(model_dm.wv.similarity('瓔珞', '皇后'))
print(model_dm.wv.similarity('瓔珞', '皇上'))
# print(model_dm.wv.key_to_index)
# %%
```
```python
'''
Basic example from the official documentation:
https://radimrehurek.com/gensim/models/doc2vec.html
'''
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
print(documents)
# %%
```