顯示具有 python 標籤的文章。 顯示所有文章
顯示具有 python 標籤的文章。 顯示所有文章

2020年1月12日 星期日

[python]doc2vec教學 step-by-step

|-doc2vec_Chinese //新建文件夹
    |- doc2vec.py //新建python
    |- data 
        |- rawdata.txt 
    |- model 

資料集來源:https://github.com/draguitar/doc2vec_Chinese/blob/master/data/rawdata.txt
import 模組
設定停止詞

# %%
import os
# Project root: the relative paths used below (data/, model/, jieba/) resolve from here.
os.chdir("D:/python/doc2vec")

import jieba
import sys
import gensim
import sklearn
import numpy as np
from gensim.models.doc2vec import Doc2Vec, LabeledSentence  # Doc2Vec classes imported from gensim
# Alias for gensim's TaggedDocument; get_datasest() refers to it by this (misspelled) name.
TaggededDocument = gensim.models.doc2vec.TaggedDocument
# The name '瓔珞' was added by hand to the custom dictionary userdict.txt.
jieba.load_userdict("./jieba/userdict.txt")

# Stop words: tokens in this list are filtered out of every line in get_datasest().
stoplist = ['的','了','被','。',',','、','她','自己','他','並','和','都','去','\n']
# %%
中文結巴斷詞
# %%
#中文分詞
def cut_files():
    """Segment the raw corpus with jieba and write it back space-separated.

    Reads ``data/rawdata.txt`` line by line, tokenizes each line with
    ``jieba.cut`` and writes the tokens joined by single spaces to
    ``data/rawdata_jieba.txt``. Line breaks are kept because jieba returns
    the trailing newline as a token of its own.

    Both files are opened in ``with`` blocks so they are flushed and closed
    even if segmentation fails part-way through.
    """
    with open('data/rawdata.txt', 'r', encoding="utf-8") as source, \
            open('data/rawdata_jieba.txt', "w", encoding="utf-8") as target:
        # Stream the file instead of loading every line into memory first.
        for line in source:
            target.write(' '.join(jieba.cut(line)))
# %%
將文本轉成>>>TaggedDocument,
# %%
def get_datasest():
    """Load the segmented corpus and turn it into tagged training documents.

    Reads ``data/rawdata_jieba.txt``, removes every token found in the
    module-level ``stoplist``, discards lines that end up empty, prints how
    many documents remain, and returns them as a list of ``TaggededDocument``
    objects tagged with their position in that list.
    """
    with open("data/rawdata_jieba.txt", 'r', encoding="utf-8") as cf:
        raw_lines = cf.readlines()

    # Stop-word removal; a line made only of stop words disappears entirely.
    docs = []
    for raw in raw_lines:
        kept = ' '.join(token for token in raw.split() if token not in stoplist)
        if kept:
            docs.append(kept)
    print(len(docs))

    x_train = []
    for tag, text in enumerate(docs):
        tokens = text.split(' ')
        tokens[-1] = tokens[-1].strip()
        x_train.append(TaggededDocument(tokens, tags=[tag]))
    return x_train
# %%
# %%
#訓練模型
def train(x_train, size=200, epoch_num=1):  # size=200 -> 200-dimensional vectors
    """Fit a Doc2Vec model on *x_train*, save it to disk and return it.

    Args:
        x_train: iterable of tagged documents, as built by ``get_datasest``.
        size: dimensionality of the learned document/word vectors.
        epoch_num: accepted for backward compatibility but not used; the
            model trains with gensim's default number of epochs.

    Returns:
        The trained ``Doc2Vec`` model, also saved as
        ``model/model_dm_doc2vec``.
    """
    # `vector_size` is the current keyword; the old `size` spelling is
    # deprecated in gensim 3.x and rejected by gensim 4.
    model_dm = Doc2Vec(x_train, min_count=1, window=3, vector_size=size,
                       sample=1e-3, negative=5, workers=4)
    model_dm.save('model/model_dm_doc2vec')

    return model_dm
# %%
# %%
def test():
    """Return the 10 training documents most similar to a fixed sample query.

    Uses the module-level ``model_dm`` (set by the ``__main__`` block) to
    infer a vector for the tokenized query and looks up its nearest document
    vectors. The result is whatever ``docvecs.most_similar`` returns.
    """
    query_tokens = ['我', '喜歡', '傅恆']
    query_vector = model_dm.infer_vector(query_tokens)
    # Top-10 most similar documents.
    return model_dm.docvecs.most_similar([query_vector], topn=10)
# %%
# %%
if __name__ == '__main__':
    # Pipeline: segment the corpus, build tagged documents, train, then query.
    cut_files()
    x_train = get_datasest()
    model_dm = train(x_train)
    sims = test()
    # Each hit is a (document tag, similarity) pair; tags index into x_train.
    for count, sim in sims:
        sentence = x_train[count]
        # Every word is followed by a space, matching the original output format.
        words = ''.join(word + ' ' for word in sentence[0])
        print(words, sim, len(sentence[0]))
# %%
    # Word-to-word similarity. Printed once, after the ranking, instead of
    # being repeated for every hit inside the loop above.
    print(model_dm.similarity('瓔珞', '皇后'))
    print(model_dm.similarity('瓔珞', '皇上'))
#print(model_dm.wv.vocab)
# %%
'''
官方文件的基本範例
https://radimrehurek.com/gensim/models/doc2vec.html
'''
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Wrap each sample text in a TaggedDocument tagged with its index in common_texts.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
# Show the resulting tagged documents.
print(documents)
# %%

2019年7月22日 星期一

[Python]Colab讀取資料三種方法

本篇教學Colab,使用三種方法讓Colab讀取CSV檔 Colab是Google提供的免費平台,允許用戶使用Python進行編碼。 Colab本質上是Jupyter筆記本的Google Suite版本。 Colab優於Jupyter的一些優點包括更容易安裝包和共享文本。


github

點擊RAW,複製網址列網址,貼到程式碼中
# pandas must be imported before this snippet can run on its own.
import pandas as pd

# Paste the address copied from GitHub's "Raw" view into this string.
url = '將網址貼到這'
df1 = pd.read_csv(url)
# Dataset is now stored in a Pandas Dataframe

上傳


import io

# pandas must be imported before this snippet can run on its own.
import pandas as pd
from google.colab import files

# Upload the CSV through Colab's file picker; the result is indexed by file name.
uploaded = files.upload()
# Replace the key with the name of the file you uploaded.
df2 = pd.read_csv(io.BytesIO(uploaded['你的檔案名稱.csv']))

Google Drive

import pandas as pd
# Code to read csv file into Colaboratory:
# The leading "!" is notebook shell syntax, so this line only works in a Colab/Jupyter cell.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
出現提示時,點擊連結進行身分認證,允許訪問你的Google Drive,許可後將驗證碼貼至Colab的驗證框中; 完成驗證後,到Google Drive中的CSV文件,右鍵選擇“取得檔案共用連結”。該連結將被複製到剪貼板中。將此連結貼到Colab中的link變數中。
link = 'https://drive.google.com/open?id=1GtzgAplaOjHEgXe2Rc-_pd1Dt_Wxl4Ya' 
# The shareable link
# Split on '=' to separate the URL prefix from the file id. This unpacking
# requires exactly one '=' in the link. NOTE(review): `id` shadows the builtin,
# but the download snippet that follows reads it under this name.
fluff, id = link.split('=')
print (id) 
# Verify that you have everything after '='
取得Dataframe
# Look the file up by the id parsed from the share link (`drive` and `id` come
# from the preceding snippets).
downloaded = drive.CreateFile({'id':id}) 
# Save its content locally as Filename.csv, then load that copy with pandas.
downloaded.GetContentFile('Filename.csv')  
df3 = pd.read_csv('Filename.csv')
# Dataset is now stored in a Pandas Dataframe

2017年9月5日 星期二

[Python]高中生程式解題系統


#a003. 兩光法師占卜術
import sys


def fortune(month, day):
    """Return the fortune for a birthday: S = (month * 2 + day) % 3.

    S == 0 -> '普通', S == 1 -> '吉', S == 2 -> '大吉'.
    """
    return ('普通', '吉', '大吉')[(month * 2 + day) % 3]


if __name__ == '__main__':
    # Each input line holds "month day".
    for line in sys.stdin:
        a, b = line.split()
        # Convert before doubling: `a * 2` on the raw string repeats the text
        # ("12" -> "1212") and only matched the formula by coincidence mod 3.
        print(fortune(int(a), int(b)))
  

#a004. 文文的求婚  
import sys

# One year per input line; print whether it is a leap year or a common year.
for raw_year in sys.stdin:
    y = int(raw_year)
    # Leap: divisible by 4 but not by 100, or divisible by 400.
    leap = (y % 4 == 0 and y % 100 != 0) or y % 400 == 0
    print('閏年' if leap else '平年')
  
#a005. Eva 的回家作業
import sys

# Each line with at least four tokens gives the first four terms of a
# sequence; echo them followed by the fifth term. Shorter lines are skipped.
for line in sys.stdin:
    fields = line.split()
    if len(fields) <= 3:
        continue
    a, b, c, d = fields
    t1, t2, t3, t4 = int(a), int(b), int(c), int(d)
    if t1 * t4 == t2 * t3:
        # Treated as geometric: multiply the last term by the ratio d / c.
        fifth = t4 / t3 * t4
    else:
        # Otherwise arithmetic: add the common difference d - c.
        fifth = t4 - t3 + t4
    print('%s %s %s %s %d' % (a, b, c, d, fifth))
   
   

#a009. 解碼器   
import sys

# Decode all of stdin: every non-whitespace character is shifted down by 7
# code points, every whitespace character becomes a newline. The decoded text
# is printed once, after the whole input has been read.
decoded = []
for line in sys.stdin:
    decoded.extend('\n' if ch.isspace() else chr(ord(ch) - 7) for ch in line)
print(''.join(decoded))

#a020: 身分證檢驗
import sys

# Two-digit code for each leading letter, indexed by ord(letter) - ord('A').
city = [10,11,12,13,14,15,16,17,34,18,19,20,21,22,35,23,24,25,26,27,28,29,32,30,31,33]

# Checksum weights for the 11 digits: 2 from the letter code, 8 body digits,
# then the check digit.
_WEIGHTS = (1, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1)


def is_valid_id(id_number):
    """Return True when *id_number* passes the ID checksum.

    The expected form is one uppercase letter followed by nine digits;
    surrounding whitespace (such as the trailing newline) is ignored.
    Anything else is reported as invalid instead of raising.
    """
    text = id_number.strip()
    if len(text) != 10 or not ('A' <= text[0] <= 'Z') or not text[1:].isdecimal():
        return False
    # Replace the letter by its two-digit code, then take the weighted sum.
    digits = str(city[ord(text[0]) - ord('A')]) + text[1:]
    total = sum(int(d) * w for d, w in zip(digits, _WEIGHTS))
    return total % 10 == 0


if __name__ == '__main__':
    for line in sys.stdin:
        if line.strip():  # ignore blank lines
            print('real' if is_valid_id(line) else 'fake')
  
#a034: 二進位制轉換
import sys

# Print each input integer in binary, dropping the first two characters
# (the '0b' prefix) of what bin() returns.
for line in sys.stdin:
    print(bin(int(line))[2:])

#a040: 平面圓形切割
import sys

# For each n read from stdin print n^2 - n + 2.
for raw in sys.stdin:
    count = int(raw)
    print(count * (count - 1) + 2)
 

#a044: 空間切割
def max_regions(planes):
    """Return the maximum number of regions *planes* planes cut space into.

    The count is (n^3 + 5n + 6) / 6. It equals 2**n only up to n = 3;
    a fourth plane yields 15 regions, not 16.
    """
    return (planes ** 3 + 5 * planes + 6) // 6


if __name__ == '__main__':
    while True:
        try:
            plane = int(input().strip())
        except (EOFError, ValueError):
            # Stop at end of input or on a non-numeric line, as before, without
            # a bare except that would also swallow unrelated errors.
            break
        print("{0}".format(max_regions(plane)))
 

#a104: 排序
import sys


def sort_numbers(line):
    """Return the whitespace-separated integers of *line* in ascending order.

    Tokens are compared by numeric value (``key=int``), so "10" sorts after
    "9"; the tokens themselves are re-emitted unchanged, joined by one space.
    """
    return ' '.join(sorted(line.split(), key=int))


if __name__ == '__main__':
    # Input comes in pairs: a count line followed by the line of numbers.
    # Pairing the lines lets a one-element array be printed too, instead of
    # guessing which line is the count from the number of tokens.
    lines = iter(sys.stdin)
    for _count_line in lines:
        data_line = next(lines, '')
        if data_line.strip():
            print(sort_numbers(data_line))
        
# Repeatedly read a count line (ignored) and a line of integers, then print
# the integers in ascending order, each followed by a single space.
while True:
    try:
        input()
        tokens = input().strip('\r').split(' ')
    except EOFError:
        break
    ordered = sorted(int(tok) for tok in tokens)
    print(''.join('%d ' % value for value in ordered))

    
#a147: Print it all   
import sys

# For each positive limit print, space-separated, every integer from 1 up to
# (not including) the limit that is not a multiple of 7.
for line in sys.stdin:
    limit = int(line)
    if limit > 0:
        print(' '.join(str(k) for k in range(1, limit) if k % 7 != 0))
        
#a149: 乘乘樂
def digit_product(digits):
    """Return the product of the decimal digits in *digits*.

    Surrounding whitespace (for example a trailing ``\\r``) is ignored; an
    empty string yields 1, the empty product.
    """
    product = 1
    for ch in digits.strip():
        product *= int(ch)
    return product


if __name__ == '__main__':
    while True:
        try:
            # First line of each group: how many numbers follow.
            size = int(input().strip())
            for _ in range(size):
                print(digit_product(input()))
        except (EOFError, ValueError):
            # End of input or a malformed line ends the run; other errors are
            # no longer hidden by a bare except.
            break
        
#a038: 數字翻轉       
# Print each input number with its digits reversed; converting the result
# back to int drops what would otherwise be leading zeros.
while True:
    try:
        text = input().strip('\r').strip('\n')
    except EOFError:
        break
    # Convert every character on its own so a non-digit is rejected up front.
    digits = [str(int(ch)) for ch in text]
    print(int(''.join(reversed(digits))))


#a022: 迴文
# Print 'yes' when the input line reads the same backwards, otherwise 'no'.
while True:
    try:
        text = input().strip('\r').strip('\n')
    except EOFError:
        break
    print('yes' if text == text[::-1] else 'no')
        
#a148: You Cannot Pass?!
# Each line is "n s1 s2 ...": print 'no' when the sum of the scores divided
# by n is above 59, otherwise 'yes'.
while True:
    try:
        tokens = input().strip('\r').strip('\n').split(' ')
    except EOFError:
        break
    count, *scores = [int(tok) for tok in tokens]
    print('no' if sum(scores) / count > 59 else 'yes')
        
#a738: 最大公约数
def is_number(s):
    """Return True if *s* parses as a float or is a single numeric character.

    ``float(s)`` is tried first; when it raises ValueError the value is
    checked with ``unicodedata.numeric`` instead, and False is returned if
    that also fails with TypeError or ValueError.
    """
    try:
        float(s)
    except ValueError:
        import unicodedata
        try:
            unicodedata.numeric(s)
        except (TypeError, ValueError):
            return False
    return True

# For every line whose first two tokens look numeric, print the greatest
# common divisor of the two smallest values on the line (Euclid's algorithm).
# Nothing is printed when the smallest value is 0.
while True:
    try:
        tokens = input().strip('\r').split(' ')
    except EOFError:
        break
    if not (len(tokens) > 1
            and is_number(tokens[0].strip())
            and is_number(tokens[1].strip())):
        continue
    values = sorted(int(tok) for tok in tokens)
    a = values[0]
    if a != 0:
        b = values[1]
        while b != 0:
            a, b = b, a % b
        print(a)
        
        
#a799: 正值國

# Print the absolute value of each input integer.
while True:
    try:
        line = input().strip('\r')
    except EOFError:
        break
    print(abs(int(line)))

#a065: 提款卡密碼
# For each line print, concatenated, the absolute code-point difference
# between every pair of neighbouring characters.
while True:
    try:
        text = input().strip('\r')
    except EOFError:
        break
    codes = [ord(ch) for ch in text]
    print(''.join(str(abs(nxt - cur)) for cur, nxt in zip(codes, codes[1:])))

#d050: 妳那裡現在幾點了?        
# Shift each input hour back by 15, wrapping hours below 15 into the
# previous day.
while True:
    try:
        hour = int(input().strip('\r'))
    except EOFError:
        break
    print(hour - 15 if hour >= 15 else hour + 24 - 15)

#d051: 糟糕,我發燒了!
# Convert each integer Fahrenheit reading to Celsius, printed with three
# decimal places.
while True:
    try:
        fahrenheit = int(input().strip('\r'))
    except EOFError:
        break
    print("%.3f" % ((fahrenheit - 32) * 5 / 9))
        
#d058: BASIC 的 SGN 函數
# Print the sign of each input integer: 1, 0 or -1.
while True:
    try:
        value = int(input().strip('\r'))
    except EOFError:
        break
    # Difference of two booleans gives the sign as an int.
    print((value > 0) - (value < 0))

#d060: 還要等多久啊?    
# Given the current minute, print how many minutes remain until minute 25,
# counting into the next hour when minute 25 has already passed.
while True:
    try:
        minute = int(input().strip('\r'))
    except EOFError:
        break
    target = 25 if minute <= 25 else 25 + 60
    print(target - minute)


#d063: 0 與 1        
# Print 1 when the input integer is 0, and 0 for any other integer.
while True:
    try:
        bit = int(input().strip('\r'))
    except EOFError:
        break
    print(1 if bit == 0 else 0)
        
#d064: 奇數?
# Print 'Even' or 'Odd' for each input integer.
while True:
    try:
        value = int(input().strip('\r'))
    except EOFError:
        break
    print('Odd' if value % 2 else 'Even')
        
#d065: 三人行必有我師        
# Print the largest of the space-separated integers on each line.
while True:
    try:
        tokens = input().strip('\r').split(' ')
    except EOFError:
        break
    print(max(map(int, tokens)))

#d066: 上學去吧!
from datetime import datetime as dt

# Each line is "hour minute". Print 'At School' when that time falls in
# [07:30, 17:00), otherwise 'Off School'.
while True:
    try:
        stamp = dt.strptime(input().strip('\r'), "%H %M")
    except EOFError:
        break
    opens = dt.strptime("07 30", "%H %M")
    closes = dt.strptime("17 00", "%H %M")
    print('At School' if opens <= stamp < closes else 'Off School')
        
#d068: 該減肥了!
# Print the input weight, reduced by 1 when it is above 50.
while True:
    try:
        weight = int(input())
    except EOFError:
        break
    print(weight - 1 if weight > 50 else weight)

#d074: 電腦教室
# Read a line that is discarded, then a line of space-separated integers,
# and print the largest of them.
while True:
    try:
        input().strip('\r')
        tokens = input().strip('\r').split(' ')
    except EOFError:
        break
    print(max(int(tok) for tok in tokens))
        

#a263: 日期差幾天  
import datetime


def _read_date():
    """Read one "year month day" line from stdin and return it as a date."""
    parts = input().strip('\r').split(' ')
    return datetime.date(*(int(parts[i]) for i in range(3)))


# Read dates two at a time and print how many days lie between them.
while True:
    try:
        day1 = _read_date()
        # The date to subtract.
        day2 = _read_date()
    except EOFError:
        break
    print(str(abs(day1 - day2).days))
        
#d124: 3的倍数
# Print 'yes' when the input integer is a multiple of 3, otherwise 'no'.
while True:
    try:
        raw = input()
    except EOFError:
        break
    print('yes' if int(raw) % 3 == 0 else 'no')

#d069: 文文的求婚--續集 (n 行版)
# Read a count n, then n years; classify each year as leap or normal.
while True:
    try:
        for _ in range(int(input())):
            y = int(input().strip('\r'))
            leap = (y % 4 == 0 and y % 100 != 0) or y % 400 == 0
            print('a leap year' if leap else 'a normal year')
    except EOFError:
        break

#d070: 文文的求婚--續集 (0 尾版)
# Classify each input year as leap or normal; a year of 0 ends the input.
while True:
    try:
        y = int(input().strip('\r'))
    except EOFError:
        break
    if y == 0:
        break
    leap = (y % 4 == 0 and y % 100 != 0) or y % 400 == 0
    print('a leap year' if leap else 'a normal year')
        
#d071: 文文的求婚--續集 (EOF 版)
# Classify each input year as leap or normal until end of input.
while True:
    try:
        y = int(input().strip('\r'))
    except EOFError:
        break
    leap = (y % 4 == 0 and y % 100 != 0) or y % 400 == 0
    print('a leap year' if leap else 'a normal year')

#d072: 文文的求婚--續集 (Case 版)
def case_report(index, year):
    """Return the "Case <index>: ..." verdict line for *year*.

    A year is a leap year when it is divisible by 4 but not by 100, or when
    it is divisible by 400.
    """
    leap = (year % 4 == 0 and year % 100 != 0) or year % 400 == 0
    # %d formats the integer case number; concatenating it to a str with '+'
    # raises TypeError.
    return 'Case %d: %s' % (index, 'a leap year' if leap else 'a normal year')


if __name__ == '__main__':
    while True:
        try:
            # First line: number of cases; cases are numbered from 1.
            num = int(input())
            for i in range(1, num + 1):
                print(case_report(i, int(input().strip('\r'))))
        except EOFError:
            break
        
#d073: 分組報告
# Print how many groups of three are needed for the input head count; a
# remainder gets one extra group.
while True:
    try:
        people = int(input())
    except EOFError:
        break
    groups = int(people / 3)
    if people % 3 != 0:
        groups += 1
    print(groups)
        
#d086: 態度之重要的證明
# Score each word case-insensitively with a=1 ... z=26 and print the total,
# or 'Fail' if the word contains any other character. A line that is just
# '0' prints nothing.
while True:
    try:
        word = input().strip('\r').strip().lower()
    except EOFError:
        break
    if word == '0':
        continue
    values = [ord(ch) - 96 for ch in word]
    if all(1 <= value <= 26 for value in values):
        print(sum(values))
    else:
        print('Fail')
        
#d827: 買鉛筆
# Price the requested number of pencils: 50 per full dozen plus 5 for each
# remaining pencil.
while True:
    try:
        wanted = int(input().strip('\r').strip())
    except EOFError:
        break
    dozens, singles = divmod(wanted, 12)
    print(dozens * 50 + singles * 5)
        

#d483: hello, world  
# The greeting is a string literal; the bare names `hello` and `world` are
# undefined and raise NameError.
MESSAGE = 'hello, world'

# Print it once, unconditionally. Waiting on input() first means nothing is
# ever printed when the program is given no input.
print(MESSAGE)
        
#a058: MOD3
# Read a count n, then n integers, and print how many of them leave
# remainder 0, 1 and 2 when divided by 3.
while True:
    try:
        tally = [0, 0, 0]
        for _ in range(int(input().strip('\r'))):
            tally[int(input().strip('\r')) % 3] += 1
        print('%d %d %d' % tuple(tally))
    except EOFError:
        break
        
#a053: Sagit's 計分程式
# Convert the number of solved problems into a score: 6 points each for the
# first 10, 2 each for problems 11-20, 1 each for problems 21-39, and a flat
# 100 for any other count.
while True:
    try:
        solved = int(input().strip('\r'))
    except EOFError:
        break
    if 0 <= solved <= 10:
        points = 6 * solved
    elif 10 < solved <= 20:
        points = 40 + 2 * solved
    elif 20 < solved <= 39:
        points = 60 + solved
    else:
        points = 100
    print(points)
        
#d460: 山六九之旅
while True:
    try:
        year = int(input().strip('\r'))
        cost = 0
        if year < 6:
            cost = 0
        elif year>=6 and year<12: cost="590" elif="" year="">=12 and year<18: cost="790" elif="" year="">=18 and year<60: break="" cost="" else:="" eoferror:="" except="" pre="" print="">