|- doc2vec_Chinese/        // new project folder
|   |- doc2vec.py          // the Python script below
|   |- jieba/
|   |   |- userdict.txt    // custom jieba dictionary
|   |- data/
|   |   |- rawdata.txt
|   |- model/
Dataset source: https://github.com/draguitar/doc2vec_Chinese/blob/master/data/rawdata.txt
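If the folders do not exist yet, a minimal sketch to create this layout (the jieba/ folder for the custom dictionary is assumed from the code below):
# %%
import os
# create the folders the rest of the script expects
for folder in ('data', 'model', 'jieba'):
    os.makedirs(folder, exist_ok=True)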
Import modules
Set up the stop word list
# %%
import os
# project path
os.chdir("D:/python/doc2vec")

import jieba
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument  # import doc2vec from gensim

# '瓔珞' was added by hand to the custom dictionary userdict.txt
jieba.load_userdict("./jieba/userdict.txt")

# stop words
stoplist = ['的','了','被','。',',','、','她','自己','他','並','和','都','去','\n']
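For reference, jieba's user dictionary is a plain text file with one word per line, optionally followed by a frequency and a part-of-speech tag ('nr' marks a person name). A hypothetical userdict.txt for this dataset might look like:
瓔珞 100 nr
傅恆 100 nr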
# %%
Chinese word segmentation with jieba
# %%
# Chinese word segmentation
def cut_files():
    filePath = 'data/rawdata.txt'
    with open(filePath, 'r', encoding="utf-8") as fr, \
         open('data/rawdata_jieba.txt', 'w', encoding="utf-8") as fw:
        for line in fr:
            curLine = ' '.join(jieba.cut(line))
            fw.write(curLine)
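A quick sanity check of the segmentation (the exact split depends on jieba's built-in dictionary plus the loaded userdict.txt):
# %%
# e.g. prints 我 喜歡 傅恆 once 傅恆 is in the user dictionary
print(' '.join(jieba.cut('我喜歡傅恆')))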
# %%
Convert the text into TaggedDocument objects
# %%
def get_dataset():
    with open("data/rawdata_jieba.txt", 'r', encoding="utf-8") as cf:
        docs = cf.readlines()

    # remove stop words
    for idx in range(len(docs)):
        docs[idx] = ' '.join([word for word in docs[idx].split() if word not in stoplist])
    docs = [doc for doc in docs if len(doc) > 0]
    print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        word_list[-1] = word_list[-1].strip()
        document = TaggedDocument(word_list, tags=[i])
        x_train.append(document)
    return x_train
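Each element of x_train is a TaggedDocument, i.e. a token list paired with a tag; here the tag is simply the line index, which is what lets the main block below map a similarity hit back to its sentence. A small illustration:
# %%
doc = TaggedDocument(['我', '喜歡', '傅恆'], tags=[0])
print(doc.words, doc.tags)   # ['我', '喜歡', '傅恆'] [0]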
# %%
# train the model
def train(x_train, size=200, epoch_num=1):  # size=200 -> 200-dimensional document vectors
    # build the Doc2Vec model (gensim >= 4.0 removed the size alias; use vector_size there)
    model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
    # model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70)
    model_dm.save('model/model_dm_doc2vec')
    return model_dm
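The constructor call above uses the gensim 3.x argument name. Under gensim >= 4.0 the size alias is gone and the argument is vector_size; an equivalent call would be (sketched as a comment to avoid retraining):
# %%
# model_dm = Doc2Vec(x_train, min_count=1, window=3, vector_size=200,
#                    sample=1e-3, negative=5, workers=4, epochs=10)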
# %%
def test():
    # model_dm = Doc2Vec.load("model/model_dm_doc2vec")
    test_text = ['我', '喜歡', '傅恆']
    inferred_vector_dm = model_dm.infer_vector(test_text)
    # top 10 most similar documents (in gensim >= 4.0 use model_dm.dv instead of model_dm.docvecs)
    sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)
    return sims
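Note that infer_vector is stochastic: it runs a short training pass over the new tokens, so repeated calls return slightly different vectors. More inference passes make the result more stable; sketched here (the parameter is epochs in gensim >= 4.0, steps in 3.x):
# %%
# more inference passes -> more stable inferred vector
# vec = model_dm.infer_vector(['我', '喜歡', '傅恆'], epochs=50)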
# %%
if __name__ == '__main__':
    cut_files()
    x_train = get_dataset()
    model_dm = train(x_train)
    sims = test()
    for count, sim in sims:
        sentence = x_train[count]
        words = ' '.join(sentence[0])
        print(words, sim, len(sentence[0]))
# %%
# word similarity (query the word-vector part of the model)
print(model_dm.wv.similarity('瓔珞', '皇后'))
print(model_dm.wv.similarity('瓔珞', '皇上'))
#print(model_dm.wv.vocab)
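The word-vector half of the model can also be queried for the nearest neighbours of a single word (with min_count=1 every token is in the vocabulary):
# %%
# 5 most similar words to 瓔珞 in the word-vector space
print(model_dm.wv.most_similar('瓔珞', topn=5))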
# %%
'''
Basic example from the official documentation
https://radimrehurek.com/gensim/models/doc2vec.html
'''
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
print(documents)
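The same page then trains a small model on these toy documents and infers a vector for an unseen token list; following the official example (vector_size is the gensim >= 4.0 name):
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
vector = model.infer_vector(["system", "response"])
print(vector)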
# %%