Remove stop words
from nltk.corpus import stopwords
# Requires the stop word corpus: nltk.download('stopwords')
tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']
stop_words = stopwords.words('english')
# Keep only the tokens that are not stop words
print([word for word in tokenized_words if word not in stop_words])
print(stop_words[:5])
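NLTK's stop word list is all lowercase, so mixed-case tokens slip through the comparison above; a minimal sketch that lowercases before comparing (the set conversion is only for faster lookup):
stop_words_set = set(stop_words)
mixed_case_words = ['I', 'Am', 'Going', 'To', 'The', 'Store']
print([word for word in mixed_case_words if word.lower() not in stop_words_set])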
Weight words by importance (tf-idf)
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
text_data = np.array(['I love China, China!', 'Sweden is best', 'Germany beats both'])
tfidf = TfidfVectorizer()
# fit_transform returns a sparse matrix of tf-idf scores
feature_matrix = tfidf.fit_transform(text_data)
print(feature_matrix)
print(feature_matrix.toarray())
print(tfidf.vocabulary_)
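To read the matrix, each column can be paired with its word; a sketch assuming scikit-learn >= 1.0, where get_feature_names_out() is available:
for word, score in zip(tfidf.get_feature_names_out(), feature_matrix.toarray()[0]):
    print(word, round(score, 3))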
Stem words
from nltk.stem import PorterStemmer
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']
# Reduce each word to its stem with the Porter algorithm
porter = PorterStemmer()
print([porter.stem(word) for word in tokenized_words])
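For comparison, lemmatization maps words to their dictionary forms rather than chopped stems; a minimal sketch using WordNetLemmatizer (requires nltk.download('wordnet')):
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print([lemmatizer.lemmatize(word) for word in tokenized_words])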
Tokenize text
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# Requires the tokenizer models: nltk.download('punkt')
string = "Like we used to do"
print(word_tokenize(string))
string = "We don't talk anymore. We don't talk anymore. We don't talk anymore. Like we used to do."
print(sent_tokenize(string))
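Unlike a plain str.split, word_tokenize also separates contractions and trailing punctuation; a quick comparison:
print("We don't talk anymore.".split())
print(word_tokenize("We don't talk anymore."))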
Encode text as a bag of words
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
text_data = np.array(['I love Sam. Sam!', 'Sam is best', 'Oh nice'])
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)
print(bag_of_words.toarray())
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
print(count.get_feature_names_out())
# Restrict the vocabulary to 'sam' and count both unigrams and bigrams
count_2gram = CountVectorizer(ngram_range=(1, 2), stop_words="english", vocabulary=['sam'])
bag = count_2gram.fit_transform(text_data)
print(bag.toarray())
print(count_2gram.vocabulary_)
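Dropping the fixed vocabulary lets the vectorizer learn both unigrams and bigrams from the data itself; a minimal sketch:
count_ngram = CountVectorizer(ngram_range=(1, 2), stop_words="english")
bag_ngram = count_ngram.fit_transform(text_data)
print(count_ngram.vocabulary_)
print(bag_ngram.toarray())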
Tag parts of speech
from nltk import pos_tag
from nltk import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer
# Requires the tagger model: nltk.download('averaged_perceptron_tagger')
text_data = "Chris loved outdoor running"
text_tagged = pos_tag(word_tokenize(text_data))
print(text_tagged)
'''
NNP: proper noun, singular
NN: noun, singular or mass
RB: adverb
VBD: verb, past tense
VBG: verb, gerund or present participle
JJ: adjective
PRP: personal pronoun
'''
# Keep only the nouns
print([word for word, tag in text_tagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']])
tweets = ["I am eating a burrito for breakfast", "Political science is an amazing field",
          "San Francisco is an amazing city"]
tagged_tweets = []
for tweet in tweets:
    tweet_tag = pos_tag(word_tokenize(tweet))
    tagged_tweets.append([tag for word, tag in tweet_tag])
# One-hot encode each tweet's set of tags into a binary feature matrix
one_hot_multi = MultiLabelBinarizer()
print(one_hot_multi.fit_transform(tagged_tweets))
print(one_hot_multi.classes_)
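To make the binary matrix readable, the columns can be labeled with their tag names; a sketch assuming pandas is installed:
import pandas as pd
features = one_hot_multi.fit_transform(tagged_tweets)
print(pd.DataFrame(features, columns=one_hot_multi.classes_))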
Clean text
import re
text_data = [" Loving him is like driving a new Maserati down a dead end street. ",
"Faster than the wind, passionate as sin, ending so suddenly.",
" Loving him is like trying change to your mind once you're already flying through the free fall. "]
strip_whitespace = [string.strip() for string in text_data]
print(strip_whitespace)
remove_periods = [string.replace(".", "") for string in strip_whitespace]
print(remove_periods)
def capitalizer(string: str) -> str:
    return string.upper()
iter1 = iter(capitalizer(string) for string in remove_periods)
for i in iter1:
    print(i, end="\n")
def replace_letters_with_X(string: str) -> str:
    return re.sub(r"[a-zA-Z]", "X", string)
iter2 = iter(replace_letters_with_X(string) for string in remove_periods)
for i in iter2:
    print(i, end="\n")
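The steps above can also be chained into one cleaning function, so any new string goes through strip, period removal, and uppercasing in a single pass; a minimal sketch:
def clean_text(string: str) -> str:
    return string.strip().replace(".", "").upper()
print([clean_text(string) for string in text_data])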
Tag parts of speech with backoff taggers
from nltk.corpus import brown
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
# Requires the Brown corpus: nltk.download('brown')
sentences = brown.tagged_sents(categories='news')
# Hold out everything after the first 4000 sentences for evaluation
train = sentences[:4000]
test = sentences[4000:]
# Chain the taggers: the trigram tagger backs off to the bigram tagger,
# which backs off to the unigram tagger, for contexts unseen in training
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)
# Note: evaluate() is renamed accuracy() in newer NLTK versions
print(trigram.evaluate(test))
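The chain can be extended with a DefaultTagger as the last fallback, so tokens whose context was never seen in training still receive a tag; using 'NN' as the default is an assumption, not something the evaluation above requires:
from nltk.tag import DefaultTagger
default = DefaultTagger('NN')  # assumed fallback tag
unigram_nn = UnigramTagger(train, backoff=default)
bigram_nn = BigramTagger(train, backoff=unigram_nn)
trigram_nn = TrigramTagger(train, backoff=bigram_nn)
print(trigram_nn.evaluate(test))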
Remove punctuation
import unicodedata
import sys
text_data = ['Hi!!!!!I. Love. This Song....,;;', '100000 Agree%%&*%!!! #LoveI T', 'Right?!?!?!']
# Build a translate table mapping every Unicode punctuation code point to None
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
print([string.translate(punctuation) for string in text_data])
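An alternative regex-based sketch; note that string.punctuation only covers ASCII punctuation, unlike the full Unicode table above (the module is aliased to avoid clashing with the string variables used elsewhere in these notes):
import re
import string as string_module
ascii_punct = re.compile('[%s]' % re.escape(string_module.punctuation))
print([ascii_punct.sub('', s) for s in text_data])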
Parse and clean HTML
from bs4 import BeautifulSoup
# Assumes a local test.html containing a <div id="b_id"> element
with open('test.html', 'r', encoding='UTF-8') as f:
    string = f.read()
soup = BeautifulSoup(string, "lxml")
print(soup.find("div", {"id": "b_id"}).text)
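The same element can be found with a CSS selector; select_one returns None when nothing matches, so a guard avoids an AttributeError:
div = soup.select_one("div#b_id")
if div is not None:
    print(div.text)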