Text Analysis Steps
Some libraries that may be used:
from bs4 import BeautifulSoup as bsoup
import re
import os
import nltk
from nltk.collocations import *
from itertools import chain
import itertools
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import reuters
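Several of the snippets below assume a dict tokenized_reuters (fileid → list of tokens) and a flat token list all_words. A minimal sketch of one way to build them from the Reuters corpus (the lowercasing and the RegexpTokenizer pattern are assumptions, not part of the original notes):
# Sketch (assumption): build tokenized_reuters and all_words from the Reuters corpus
tokenizer = RegexpTokenizer(r"\w+")
tokenized_reuters = {fileid: tokenizer.tokenize(reuters.raw(fileid).lower())
                     for fileid in reuters.fileids()}
all_words = list(chain.from_iterable(tokenized_reuters.values()))
words = all_words  # flat token list used by the FreqDist examples below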
Generate the top 100 bigram collocations:
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(all_words)  # all_words: flat list of tokens
bigram_finder.apply_freq_filter(2)                     # keep bigrams occurring at least twice
bigram_finder.apply_word_filter(lambda w: len(w) < 3)  # ignore words shorter than 3 characters
top_100_bigrams = bigram_finder.nbest(bigram_measures.pmi, 100)  # top-100 bigrams by PMI
Generate the TF-IDF vectors:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(input = 'content', analyzer = 'word')
tfidf_vectors = tfidf_vectorizer.fit_transform(patent_words)  # patent_words: iterable of raw document strings (assumed)
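To sanity-check the result (still assuming patent_words is a list of raw document strings), a minimal sketch:
print(tfidf_vectors.shape)                        # (number of documents, vocabulary size)
print(tfidf_vectorizer.get_feature_names()[:10])  # first 10 vocabulary terms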
Or:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer = "word")
tfs = tfidf.fit_transform([' '.join(value) for value in tokenized_reuters.values()])
vocab = tfidf.get_feature_names()
for word, weight in zip(vocab, tfs.toarray()[0]):
    if weight > 0:
        print(word, ":", weight)
Write the results to a txt file:
save_file = open("patent_student.txt", 'w')
vocab = tfidf_vectorizer.get_feature_names()
cx = tfidf_vectors.tocoo() #Return the coordinate representation of a sparse matrix
for i, j, v in itertools.zip_longest(cx.row, cx.col, cx.data):
    save_file.write(pids[i] + ',' + vocab[j] + ',' + str(v) + '\n')  # pids: list of document IDs (assumed to be defined earlier)
save_file.close()
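An equivalent sketch using a with-block, which closes the file automatically:
with open("patent_student.txt", 'w') as save_file:
    for i, j, v in itertools.zip_longest(cx.row, cx.col, cx.data):
        save_file.write(pids[i] + ',' + vocab[j] + ',' + str(v) + '\n')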
Most common words
1. Words that occur most frequently
from nltk.probability import *
fd_1 = FreqDist(words)
fd_1.most_common(25)
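Since matplotlib is already imported, the distribution can also be plotted; a minimal sketch:
fd_1.plot(25, cumulative=True)  # cumulative frequency curve of the 25 most common words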
2. Words that appear in the most documents
words_2 = list(chain.from_iterable([set(value) for value in tokenized_reuters.values()]))
fd_2 = FreqDist(words_2)
fd_2.most_common(25)
3. Words that occur rarely
fd_3 = FreqDist(words)
lessFreqWords = set(k for k, v in fd_3.items() if v < 2)  # words occurring fewer than 2 times
Or:
lessFreqWords = set(fd_3.hapaxes())  # hapaxes() returns the words that occur exactly once
def removeLessFreqWords(fileid):
    # keep only the tokens that are not in the low-frequency set
    return (fileid, [w for w in tokenized_reuters[fileid] if w not in lessFreqWords])
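The helper can then be applied to every document; a minimal sketch that rebuilds the tokenized dict:
tokenized_reuters = dict(removeLessFreqWords(fileid) for fileid in tokenized_reuters)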
View where a particular word appears (concordance):
nltk.Text(reuters.words()).concordance('net')
Creating Count Vectors
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word")
data_features = vectorizer.fit_transform([' '.join(value) for value in tokenized_reuters.values()])
vocab2 = vectorizer.get_feature_names()
for word, count in zip(vocab2, data_features.toarray()[0]):
    if count > 0:
        print(word, ":", count)
Extract bigrams:
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(all_words)
bigram_finder.apply_freq_filter(2)
bigram_finder.apply_word_filter(lambda w: len(w) < 3)
top_100_bigrams = bigram_finder.nbest(bigram_measures.pmi, 100) # Top-100 bigrams
Or:
from nltk.util import ngrams
bigrams = ngrams(reuters.words(), n = 2)
fdbigram = FreqDist(bigrams)
fdbigram.most_common()  # pass an integer, e.g. most_common(100), to limit the output
The following code finds the best 100 bigrams using PMI scores:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(reuters.words())
finder.nbest(bigram_measures.pmi, 100)
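To see the PMI scores alongside the bigrams, score_ngrams can be used; a minimal sketch:
# Sketch: bigrams together with their PMI scores, highest first
for bigram, score in finder.score_ngrams(bigram_measures.pmi)[:10]:
    print(bigram, score)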