京东评论词云

爬取评论

import requests
import re
import time
import json
import pandas as pd

# Crawl JD product-review pages (productId=4311178, pageSize=50 per request)
# and collect every comment text into content_list, then save them to CSV.
url_template = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=4311178&score=0&sortType=5&page={}&pageSize=50&isShadowSku=0&fold=1'

content_list = []
for i in range(99):
    print('正在爬取第{}页'.format(i+1))
    try:
        url = url_template.format(i)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(url, timeout=10).text
        # Raw-string patterns: '\}' in a plain literal is an invalid escape.
        # Indexing [0] raises IndexError when the response has no "comments"
        # payload (e.g. an empty or blocked response).
        datas = re.findall(r'comments":(.*)\}', r)[0]
        content = re.findall(r'"content":"(.*?)"', datas)
        content_list.extend(content)
    except (requests.RequestException, IndexError) as exc:
        # Best effort: report the failed page (with the reason) and move on,
        # without swallowing KeyboardInterrupt/SystemExit like a bare except.
        print('本页爬取失败', exc)
    # Throttle every request, including failed ones, to stay polite.
    time.sleep(3)

# One row per comment, stored in a single column.
df = pd.DataFrame()
df['评论'] = content_list
df.to_csv('A400.csv')

生成词云图

import jieba 
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build a plain word cloud from the comment text file.
# NOTE(review): the scraping step in this file writes 'A400.csv', while this
# cell reads 'A400.txt' — confirm the text file is produced separately.
# The context manager closes the file handle once all lines are read.
with open('A400.txt', encoding='utf8') as text:
    mylist = list(text)

# Segment each line with jieba and join the tokens with spaces, because
# WordCloud.generate splits its input on whitespace.
word_list = [' '.join(jieba.cut(i)) for i in mylist]
new_text = ' '.join(word_list)
# A font containing CJK glyphs is needed to render Chinese words.
wordcloud = WordCloud(font_path='./simhei.ttf', background_color='white').generate(new_text)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.083 seconds.
Prefix dict has been built successfully.

# 根据词频
import jieba
from PIL import Image
import numpy as np
from wordcloud import WordCloud, STOPWORDS
# from scipy.misc import imread
import imageio
import matplotlib.pyplot as plt

from collections import Counter

# Build a mask-shaped word cloud weighted by word frequency.
# NOTE(review): the scraping step in this file writes 'A400.csv', while this
# cell reads 'A400.txt' — confirm the text file is produced separately.
# The context manager closes the file handle once all lines are read.
with open('A400.txt', encoding='utf8') as text:
    mylist = list(text)
word_list = [' '.join(jieba.cut(i)) for i in mylist]
new_text = ' '.join(word_list)

# Keep only tokens of two or more characters; this drops the separator
# spaces, newlines and single-character words.
con_words = [x for x in jieba.cut(new_text) if len(x) >= 2]
# Map word -> count, ordered from most to least frequent.
frequencies = dict(Counter(con_words).most_common())

# Load the mask image as an array; it is passed to WordCloud as `mask`.
pac_mask = Image.open('./apchong.png')
pac = np.array(pac_mask)


wordcloud = WordCloud(font_path='./simhei.ttf',
                      background_color='white',
                      max_words=2000,
                      mask=pac).fit_words(frequencies)
plt.figure(dpi=150)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# To save the rendered image, uncomment:
# wordcloud.to_file('wcnh2.png')