# 京东评论词云 (JD.com review word cloud)
# 爬取评论 (Scrape the reviews)
import requests
import re
import time
import json
import pandas as pd
# URL template for one page of comments; `{}` is the zero-based page index.
COMMENTS_URL = ('https://club.jd.com/comment/productPageComments.action'
                '?callback=fetchJSON_comment98&productId=4311178&score=0'
                '&sortType=5&page={}&pageSize=50&isShadowSku=0&fold=1')

# Everything after the `comments":` key up to the last `}` on the same line.
_COMMENTS_RE = re.compile(r'comments":(.*)\}')
# A `"content":"..."` value; `\\.` lets escaped characters (including `\"`)
# stay inside the captured value instead of ending it early.
_CONTENT_RE = re.compile(r'"content":"((?:[^"\\]|\\.)*)"')


def extract_contents(payload):
    """Return every comment text found in one response body.

    Args:
        payload: Raw response text of a comments page.

    Returns:
        A list of comment strings, in the order they appear, with JSON
        escape sequences (``\\n``, ``\\"``, ``\\uXXXX``) decoded.

    Raises:
        ValueError: If the payload contains no ``comments":`` section.
    """
    section = _COMMENTS_RE.search(payload)
    if section is None:
        raise ValueError('no comments section in response')

    contents = []
    for raw in _CONTENT_RE.findall(section.group(1)):
        try:
            # The captured text is the inside of a JSON string literal, so
            # let the JSON decoder resolve its escape sequences.
            contents.append(json.loads('"' + raw + '"', strict=False))
        except ValueError:
            # Not a valid JSON string body; keep the raw text rather than
            # dropping the comment.
            contents.append(raw)
    return contents


def fetch_comments(pages=99, delay=3, timeout=10):
    """Download `pages` comment pages and return all comment texts.

    A page that cannot be fetched or parsed is reported and skipped; the
    remaining pages are still attempted.

    Args:
        pages: Number of pages to request, starting at page index 0.
        delay: Seconds to wait after each page, successful or not.
        timeout: Seconds before a single HTTP request is abandoned.

    Returns:
        A flat list with the comment texts of every page that succeeded.
    """
    contents = []
    for i in range(pages):
        print('正在爬取第{}页'.format(i+1))
        try:
            r = requests.get(COMMENTS_URL.format(i), timeout=timeout).text
            contents.extend(extract_contents(r))
        except (requests.RequestException, ValueError):
            # Only network and parse failures are treated as "page failed";
            # Ctrl-C and programming errors propagate.
            print('本页爬取失败')
        # Throttle after failures too, so an error does not trigger an
        # immediate follow-up request.
        time.sleep(delay)
    return contents


if __name__ == '__main__':
    content_list = fetch_comments()
    df = pd.DataFrame({'评论': content_list})
    # NOTE(review): the word-cloud steps further down read 'A400.txt', which
    # this step does not create -- confirm how that file is produced.
    df.to_csv('A400.csv')
# 生成词云图 (Generate the word-cloud image)
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Render a plain word cloud from the comment text file.
# NOTE(review): the scraping step writes 'A400.csv' but this reads
# 'A400.txt' -- confirm where the .txt file comes from.
with open('A400.txt', encoding='utf8') as text:
    # One list entry per line; the `with` block closes the file afterwards.
    mylist = list(text)

# Segment each line with jieba and join the tokens with spaces so that
# WordCloud can split the text on whitespace.
word_list = [' '.join(jieba.cut(i)) for i in mylist]
new_text = ' '.join(word_list)

# A font path is passed explicitly; './simhei.ttf' must exist next to the script.
wordcloud = WordCloud(font_path='./simhei.ttf', background_color='white').generate(new_text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Output (jieba start-up log):
#   Building prefix dict from the default dictionary ...
#   Loading model from cache /tmp/jieba.cache
#   Loading model cost 1.083 seconds.
#   Prefix dict has been built successfully.

import jieba
from PIL import Image
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import imageio
import matplotlib.pyplot as plt
from collections import Counter
# Render a word cloud from token frequencies, shaped by a mask image.
# NOTE(review): the scraping step writes 'A400.csv' but this reads
# 'A400.txt' -- confirm where the .txt file comes from.
with open('A400.txt', encoding='utf8') as text:
    # One list entry per line; the `with` block closes the file afterwards.
    mylist = list(text)

word_list = [' '.join(jieba.cut(i)) for i in mylist]
new_text = ' '.join(word_list)

# Keep only tokens of two or more characters; this also drops the single
# spaces used to join the tokens above.
# NOTE(review): `new_text` is already segmented, so this is a second jieba
# pass over the same text -- left as is to keep the resulting counts unchanged.
con_words = [x for x in jieba.cut(new_text) if len(x) >= 2]
frequencies = dict(Counter(con_words).most_common())

# Copy the mask pixels into an array, then let the `with` block release
# the image file.
with Image.open('./apchong.png') as pac_mask:
    pac = np.array(pac_mask)

wordcloud = WordCloud(font_path='./simhei.ttf',
                      background_color='white',
                      max_words=2000,
                      mask=pac).fit_words(frequencies)
plt.figure(dpi=150)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
