# -*- coding: utf-8 -*-
import io
import os
import sys

import jieba
import jieba.analyse
import sklearn.feature_extraction
import sklearn.externals.joblib as jl
import sklearn.naive_bayes as nb
import xlrd
from sklearn import metrics
# Python 2 only: switch the default codec to UTF-8 so implicit
# str/unicode conversions of the Chinese corpus text do not raise.
# `reload` is not a builtin on Python 3 (where str is already unicode),
# so the hack must be guarded or the script dies at import time.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')
def calculate_result(actual, pred):
    """Print precision, recall and F1 for a set of predictions.

    actual -- iterable of ground-truth class labels
    pred   -- iterable of predicted labels, same length as ``actual``
    """
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    # Parenthesized single-argument print() is valid on Python 2 and 3,
    # replacing the py2-only print statements (and stray semicolons).
    print('predict info:')
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred)))
import io

# Containers and models for the short-text classification task.
train_set = []

# Extra user dictionary so jieba segments domain terms as single tokens.
jieba.load_userdict("userdict.txt")

# Stop-word set: a set gives O(1) membership tests in the token filters
# below (a list makes every lookup O(n)); the context manager closes the
# handle the original left open, and io.open(..., encoding='utf-8')
# yields unicode on both Python 2 and 3, replacing the py2-only
# `.decode('utf-8')`.
with io.open('./stopwords.txt', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)
stopwords.add(' ')  # also drop bare spaces produced by the tokenizer

kvlist = []      # training documents, each a list of tokens
targetlist = []  # training labels, aligned 1:1 with kvlist

# Multinomial NB with light smoothing; the hasher maps token lists into
# a fixed 15000-dim sparse space (non_negative as required by NB).
gnb = nb.MultinomialNB(alpha=0.01)
fh = sklearn.feature_extraction.FeatureHasher(
    n_features=15000, non_negative=True, input_type='string')
# Read the labelled training sheet: column 0 holds the raw text,
# column 1 the class label.
data = xlrd.open_workbook('./data/train.csv.xls')
table = data.sheets()[0]
nrows = table.nrows
ncols = table.ncols

for r in range(1, nrows):  # row 0 is the header
    line = table.row_values(r)
    # Tokenize only labelled rows so kvlist and targetlist stay the
    # same length (a mismatch would make gnb.fit fail later).
    if line[1] != '':
        targetlist.append(int(line[1]))
        wordlist = filter(lambda w: w not in stopwords,
                          jieba.cut(line[0], cut_all=False))
        kvlist.append(list(wordlist))
# Read the held-out test sheet (same layout as the training sheet).
testlist = []
testtargetlist = []
data = xlrd.open_workbook('./data/test.xls')
table = data.sheets()[0]
nrows = table.nrows
ncols = table.ncols

for r in range(1, nrows):  # row 0 is the header
    line = table.row_values(r)
    # As with training data, keep features and labels aligned by
    # tokenizing only rows that actually carry a label.
    if line[1] != '':
        testtargetlist.append(int(line[1]))
        wordlist = filter(lambda w: w not in stopwords,
                          jieba.cut(line[0], cut_all=False))
        testlist.append(list(wordlist))
print('*************************\nNB\n*************************')
# FeatureHasher is stateless, so transform() on the test set produces
# exactly the same matrix as a second fit_transform() while making the
# train/test distinction explicit.
X = fh.fit_transform(kvlist)
testX = fh.transform(testlist)

gnb.fit(X, targetlist)
result = gnb.predict(testX)

calculate_result(testtargetlist, result)
from sklearn.svm import SVC

print('*************************\nSVM\n*************************')
# Linear-kernel SVM trained on the same hashed features as the NB model,
# evaluated with the shared helper (py2 print statements and stray
# semicolons removed).
svclf = SVC(kernel='linear')
svclf.fit(X, targetlist)
pred = svclf.predict(testX)
calculate_result(testtargetlist, pred)
# Classify every raw file under ./data/2016 with the trained SVM.
for root, dirs, files in os.walk('./data/2016'):
    for name in files:
        # 'with' closes each handle; the original opened every file and
        # never closed any of them.
        with open(os.path.join(root, name), 'r') as f:
            raw = f.read()
        word_list = filter(lambda w: w not in stopwords,
                           jieba.cut(raw, cut_all=False))
        # Fresh local names: the original reassigned kvlist and X,
        # silently clobbering the training-corpus globals.
        doc = [list(word_list)]
        features = fh.transform(doc)  # stateless hasher: same as fit_transform
        pred = svclf.predict(features)
        print(raw)
        print(pred)