A program that segments text with jieba and removes stop words.


1. Create two folder trees: one for un-segmented files and one for segmented output. Name the sub-folders of the un-segmented tree by category; each category folder may contain multiple files to be segmented.

2. Prepare a stop-word file (jieba itself does not ship a stop-word list).

3. Customize the dictionary according to business needs (here jieba's built-in dictionary is used).


@file: jieba word segmentation with stop-word removal
@Time: 2018/08/27
# This program performs jieba word segmentation and removes stop words.

# Bug fix: the keyword must be lowercase `import`; the capitalized form
# is a syntax error introduced by the copy/paste mangling.
import os

import jieba  # third-party: Chinese word segmentation

# Save file function
def savefile(savepath, content):
    """Write *content* to *savepath* as UTF-8 text.

    Bug fix: the original opened the file but never wrote the content
    and never closed the handle.  A ``with`` block guarantees both the
    write and the close.
    """
    with open(savepath, 'w', encoding='utf8', errors='ignore') as fp:
        fp.write(content)

# Read file function
def readfile(path):
    """Return the entire contents of *path* decoded as UTF-8.

    Bug fix: the original assignment was truncated (``Content =`` with
    no right-hand side) so nothing was ever read, and the file handle
    was leaked.  ``with`` closes it deterministically.
    """
    with open(path, "r", encoding='utf8', errors='ignore') as fp:
        content = fp.read()
    return content

## Two helpers for removing stop words
# Build a stop-word list from a file (one word per line)
def stopwordslist(filepath):
    """Return a list of stop words read from *filepath*.

    Each line of the file is treated as one stop word and stripped of
    surrounding whitespace.
    """
    # 'with' closes the handle (the original leaked it); iterating the
    # file object directly avoids readlines()'s intermediate list.
    with open(filepath, 'r', encoding='utf-8') as fp:
        stopwords = [line.strip() for line in fp]
    return stopwords

# Remove the stop words (and tab/newline characters) from a sentence
def movestopwords(sentence, stopwords=None):
    """Return *sentence* with stop-word tokens removed.

    Parameters
    ----------
    sentence : iterable of str
        The tokens (or characters, when a plain string is passed) to
        filter.
    stopwords : collection of str, optional
        Stop words to drop.  Defaults to loading 'stop_words.txt' from
        the current directory, matching the original behaviour.

    Bug fix: the original condition ``word != '\\t' and '\\n'`` compared
    only against tab — the second operand was a bare (always truthy)
    string literal — so newlines were never filtered.  The comment
    intent was clearly to drop both.
    """
    if stopwords is None:
        stopwords = stopwordslist('stop_words.txt')  # here load the path to the stop word
    stopwords = set(stopwords)  # O(1) membership tests inside the loop
    kept = [word for word in sentence
            if word not in stopwords and word not in ('\t', '\n')]
    # ''.join is linear, unlike repeated string concatenation.
    return ''.join(kept)

if __name__ == '__main__':

    # Bug fix: the original literals contained a stray space
    # ("corpus /train/") from the copy/paste mangling.
    corpus_path = "corpus/train/"    # un-segmented corpus root, one sub-dir per class
    seg_path = "corpus/train_seg/"   # segmented corpus output root

    catelist = os.listdir(corpus_path)  # all category sub-directories
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # input directory for this category
        seg_dir = seg_path + mydir + "/"        # output directory for this category
        if not os.path.exists(seg_dir):
            # Bug fix: the original 'if' had no body, so the output
            # directory was never created and savefile() would fail.
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)  # all files in the current category
        for file_path in file_list:
            fullname = class_path + file_path  # path + filename
            print("The currently processed file is: ", fullname)

            content = readfile(fullname).strip()          # read the file contents
            content = content.replace("\n", "").strip()   # drop line breaks / extra spaces
            content_seg = jieba.cut(content)              # jieba segmentation (generator)

            # Join the segmented tokens with single spaces.  ' '.join is
            # linear, unlike the original quadratic += loop, and we no
            # longer print content_seg (which only showed the generator
            # object's repr, never the tokens).
            listcontent = ' '.join(content_seg)
            listcontent = movestopwords(listcontent)      # remove stop words
            print("After removing the stopwords:", listcontent[0:10])
            # One token per line in the output file.  NOTE(review): the
            # original chained two replace() calls; the second looked
            # like a mangled full-width space — confirm if the corpus
            # contains full-width spaces.
            listcontent = listcontent.replace(" ", "\n")
            savefile(seg_dir + file_path, listcontent)    # save the segmented text