#!/usr/bin/env python
# coding:utf-8
import io
import os

import jieba
# Define where we put the files
source_dictionary = "/opt/ht/source"
keywords_file = "/opt/ht/keywords-new.txt"
result_file = "/opt/ht/result.txt"

# Articles are cut into sentences on the CJK full stop.
_SENTENCE_DELIMITER = u"。"
# Written after the last sentence of every article, on a line of its own.
_FILE_MARKER = u"A New File Start\n"


def _load_keywords(path):
    """Return the set of keywords stored one per line in the UTF-8 file *path*.

    Surrounding whitespace (including the line terminator) is stripped and
    blank lines are skipped; an unstripped keyword still carries its
    trailing newline and can never equal a token.
    """
    with io.open(path, encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle)
        return set(word for word in stripped if word)


def _find_keywords(sentence, keywords, tokenize):
    """Return the tokens of *sentence* found in *keywords*.

    Each keyword is reported once, in order of first appearance, so the
    output is deterministic.
    """
    seen = set()
    matches = []
    for word in tokenize(sentence):
        if word in keywords and word not in seen:
            seen.add(word)
            matches.append(word)
    return matches


def _format_article(text, keywords, tokenize):
    """Return the result text for one article.

    One line is produced per sentence (``word;word;`` followed by a newline,
    or an empty line when nothing matched), then the file marker line.
    """
    lines = []
    for sentence in text.split(_SENTENCE_DELIMITER):
        matches = _find_keywords(sentence, keywords, tokenize)
        lines.append(u"".join(u"%s;" % word for word in matches) + u"\n")
    lines.append(_FILE_MARKER)
    return u"".join(lines)


def main(source_dir=source_dictionary, keywords_path=keywords_file,
         result_path=result_file, tokenize=None):
    """Write the keywords found in every sentence of every source article.

    Args:
        source_dir: Directory holding the UTF-8 article files.
        keywords_path: UTF-8 file with one keyword per line.
        result_path: File to (over)write with the results, as UTF-8.
        tokenize: Callable turning a sentence into an iterable of words.
            Defaults to ``jieba.cut(sentence, cut_all=False)``.

    Raises:
        IOError/OSError: If a path cannot be read or written.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    if tokenize is None:
        def tokenize(sentence):
            return jieba.cut(sentence, cut_all=False)

    keywords = _load_keywords(keywords_path)

    with io.open(result_path, "w", encoding="utf-8") as r_file:
        # Sorted so the result file does not depend on directory order.
        for name in sorted(os.listdir(source_dir)):
            # os.listdir yields bare names; resolve them against source_dir.
            path = os.path.join(source_dir, name)
            if not os.path.isfile(path):
                continue
            with io.open(path, encoding="utf-8") as article:
                text = article.read()
            r_file.write(_format_article(text, keywords, tokenize))


if __name__ == "__main__":
    main()
![]() |
1
freeznet 2014-03-24 13:52:14 +08:00
如果能把样本文件发出来应该会更好
|