Skip to content

Commit f50cec1

Browse files
committed
update
1 parent 11cb365 commit f50cec1

File tree

9 files changed

+239
-0
lines changed

9 files changed

+239
-0
lines changed

爬虫/Include/豆瓣2/analyse.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import matplotlib.pyplot as plt
2+
import matplotlib
3+
import jieba
4+
import xlwt
5+
import xlrd
6+
from wordcloud import WordCloud
7+
import numpy as np
8+
from collections import Counter
9+
# Configure matplotlib globally: SimHei so CJK labels render, and
# disable the unicode minus so axis ticks display correctly with it.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
12+
def anylasescore(comment):
    """Plot a pie chart of the 1-5 star rating distribution.

    comment: list of rows; each row's index 2 is expected to hold the
    star rating as a digit string '1'..'5' (rows without a parseable
    rating are skipped). Saves the chart to score.png and shows it.
    """
    # score[1..5] counts ratings by star value; score[0] is unused padding
    # so the star value can index directly.
    score = [0, 0, 0, 0, 0, 0]
    count = 0
    for va in comment:
        try:
            score[int(va[2])] += 1
            count += 1
        except (ValueError, TypeError, IndexError):
            # Row without a valid rating — skip it.
            continue
    print(score)
    if count == 0:
        # BUGFIX: the original divided by count unconditionally and would
        # raise ZeroDivisionError on an empty/unrated data set.
        print('no rated comments to plot')
        return
    label = '1分', '2分', '3分', '4分', '5分'
    color = 'blue', 'orange', 'yellow', 'green', 'red'  # slice colors
    size = [0, 0, 0, 0, 0]
    explode = [0, 0, 0, 0, 0]
    # BUGFIX: the original looped `for i in range(1, 5)`, which left the
    # 1-star slice (size[0]) permanently at 0 and silently dropped the
    # 5-star counts (score[5]). size[i] is the percentage of (i + 1)-star
    # ratings.
    for i in range(5):
        size[i] = score[i + 1] * 100 / count
        explode[i] = score[i + 1] / count / 10
    pie = plt.pie(size, colors=color, explode=explode, labels=label,
                  shadow=True, autopct='%1.1f%%')
    # Shrink label and percentage text.
    for font in pie[1]:
        font.set_size(8)
    for digit in pie[2]:
        digit.set_size(8)
    plt.axis('equal')
    plt.title(u'各个评分占比', fontsize=12)
    plt.legend(loc=0, bbox_to_anchor=(0.82, 1))  # legend placement
    # Shrink the legend font.
    leg = plt.gca().get_legend()
    ltext = leg.get_texts()
    plt.setp(ltext, fontsize=6)
    plt.savefig("score.png")
    plt.show()
44+
def getzhifang(map):
    """Bar-chart the 15 most common words in *map* (a collections.Counter).

    NOTE(review): the parameter shadows the builtin ``map``; the name is
    kept so existing keyword callers keep working.
    """
    words = []
    counts = []
    for k, v in map.most_common(15):
        words.append(k)
        counts.append(v)
    Xi = np.array(words)
    Yi = np.array(counts)
    # BUGFIX (dead code): the original rebound `x = np.arange(0, 15, 1)`
    # and never used it; removed.
    width = 0.6
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK tick labels
    plt.figure(figsize=(8, 6))  # 8:6 aspect ratio
    plt.bar(Xi, Yi, width, color='blue', label='热门词频统计', alpha=0.8,)
    plt.xlabel("词频")
    plt.ylabel("次数")
    plt.show()
    return
62+
def getciyun_most(map):
    """Render word clouds of the 300 most common words in *map*.

    The top 150 words are written to img.jpg and the next 150 to
    img2.jpg. Requires nezha.jpg (mask image) and simhei.ttf in the
    working directory.
    """
    # BUGFIX (dead code): the original also accumulated the counts into a
    # list `y` that was never used; only the words are needed here.
    words = [k for k, _ in map.most_common(300)]
    xi = ' '.join(words[0:150])
    print(xi)
    backgroud_Image = plt.imread('nezha.jpg')  # mask shaping the cloud
    wc = WordCloud(background_color="white",
                   width=1500, height=1200,
                   # min_font_size=40,
                   mask=backgroud_Image,
                   # font_path is required for CJK text; without it the
                   # cloud renders as rows of empty boxes.
                   font_path="simhei.ttf",
                   max_font_size=150,  # cap the largest word
                   random_state=50,  # fixed state → reproducible layout
                   )
    my_wordcloud = wc.generate(xi)
    plt.imshow(my_wordcloud)
    my_wordcloud.to_file("img.jpg")
    xi = ' '.join(words[150:300])
    my_wordcloud = wc.generate(xi)
    my_wordcloud.to_file("img2.jpg")

    plt.axis("off")
def anylaseword(comment):
    """Tokenize comment texts (row index 3) with jieba, count word
    frequencies, and plot a bar chart plus word clouds.

    Tokens of length <= 1, the literal '\\r\\n', and words seen fewer
    than 5 times are dropped.
    """
    c = Counter()
    for va in comment:
        seg_list = jieba.cut(va[3], cut_all=False)
        for word in seg_list:
            if len(word) > 1 and word != '\r\n':
                # BUGFIX (cleanup): the original wrapped this increment in
                # try/except, but Counter.__getitem__ never raises; the
                # unused locals `low`, `index`, and the typo'd, never-read
                # `commnetstr` accumulator are removed as well.
                c[word] += 1
    # most_common() returns a snapshot list, so popping entries from the
    # Counter while looping over it is safe.
    for (k, v) in c.most_common():
        if v < 5:
            c.pop(k)
            continue
    print(len(c), c)
    getzhifang(c)
    getciyun_most(c)
114+
def anylase():
    """Load the scraped comments from nezha.xls and run both analyses."""
    data = xlrd.open_workbook('nezha.xls')  # open the workbook
    table = data.sheets()[0]  # first sheet
    comment = []
    # BUGFIX: the original hard-coded range(1, 500) and raised IndexError
    # whenever the sheet held fewer rows. Read whatever the sheet actually
    # contains (row 0 is skipped as in the original), keeping the same
    # 499-row cap for compatibility.
    for i in range(1, min(500, table.nrows)):
        comment.append(table.row_values(i))
    # print(comment)
    anylasescore(comment)
    anylaseword(comment)
123+
124+
if __name__ == '__main__':
    # Run the full analysis pipeline when executed as a script.
    anylase()
126+
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import urllib.parse
4+
5+
import xlwt
6+
import xlrd
7+
8+
9+
10+
def login(username, password):
    """Log in to Douban via the mobile basic-login endpoint.

    Returns the session cookies as a plain dict.
    """
    url = 'https://accounts.douban.com/j/mobile/login/basic'
    # Browser-like headers expected by the endpoint.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony',
        'Origin': 'https://accounts.douban.com',
        'content-Type': 'application/x-www-form-urlencoded',
        'x-requested-with': 'XMLHttpRequest',
        'accept': 'application/json',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'connection': 'keep-alive',
        'Host': 'accounts.douban.com'
    }
    data = {
        'ck': '',
        'name': username,
        'password': password,
        'remember': 'false',
        'ticket': ''
    }
    payload = urllib.parse.urlencode(data)
    # BUGFIX (security): the original `print(data)` wrote the plaintext
    # password to stdout; removed.
    # SECURITY: verify=False disables TLS certificate validation — kept to
    # preserve existing behavior, but it should be removed.
    req = requests.post(url, headers=header, data=payload, verify=False)
    cookies = requests.utils.dict_from_cookiejar(req.cookies)
    print(cookies)
    return cookies
39+
def getcomment(cookies):
    """Scrape Douban comments for movie 26794435, 20 per page, writing
    (index, name, star, text) rows to nezha.xls.

    Stops when a page returns no comments or any request/parse error
    occurs; whatever was collected is still saved.
    """
    start = 0
    w = xlwt.Workbook(encoding='ascii')
    ws = w.add_sheet('sheet1')
    index = 1  # next worksheet row; row 0 is left empty
    while True:
        try:
            url = ('https://movie.douban.com/subject/26794435/comments?start='
                   + str(start) + '&limit=20&sort=new_score&status=P&comments_only=1')
            start += 20
            req = requests.get(url, cookies=cookies)
            res = req.json()
            res = res['html']
            soup = BeautifulSoup(res, 'lxml')
            node = soup.select('.comment-item')
            # BUGFIX: terminate cleanly when a page carries no comments
            # instead of relying solely on an exception to end the loop.
            if not node:
                break
            for va in node:
                name = va.a.get('title')
                # The rating is encoded in a class such as 'allstar40';
                # the second-to-last character is the star digit.
                star = va.select_one('.comment-info').select('span')[1].get('class')[0][-2]
                comment = va.select_one('.short').text
                print(name, star, comment)
                ws.write(index, 0, index)
                ws.write(index, 1, name)
                ws.write(index, 2, star)
                ws.write(index, 3, comment)
                index += 1
        except Exception as e:
            # Best-effort scrape: any network/parse failure ends the loop;
            # collected rows still get saved below.
            print(e)
            break
    w.save('nezha.xls')
69+
70+
71+
72+
73+
if __name__ == '__main__':
    # SECURITY NOTE(review): real account credentials are hard-coded in
    # source — they should be rotated immediately and read from
    # environment variables or getpass instead of being committed.
    cookies=login('15751512041','52cuihuini')
    getcomment(cookies)
77+
78+
79+
80+

爬虫/Include/豆瓣2/img.jpg

167 KB
Loading

爬虫/Include/豆瓣2/img2.jpg

170 KB
Loading

爬虫/Include/豆瓣2/login.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import requests
2+
import urllib.parse
3+
from http import cookiejar
4+
5+
# Douban mobile basic-login endpoint.
url='https://accounts.douban.com/j/mobile/login/basic'
# Browser-like request headers expected by the login endpoint.
header={'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony',
'Origin': 'https://accounts.douban.com',
'content-Type':'application/x-www-form-urlencoded',
'x-requested-with':'XMLHttpRequest',
'accept':'application/json',
'accept-encoding':'gzip, deflate, br',
'accept-language':'zh-CN,zh;q=0.9',
'connection': 'keep-alive'
,'Host': 'accounts.douban.com'
}
# Login form template; 'name' and 'password' are filled in by login().
data={
'ck':'',
'name':'',
'password':'',
'remember':'false',
'ticket':''
}
24+
def login(username, password):
    """Log in to Douban and return the session cookies as a dict.

    BUGFIX: the original declared ``global data`` and then rebound the
    module-level dict with ``data = urllib.parse.urlencode(data)``, so a
    second call would urlencode a *string* and post garbage. Build a
    local payload instead; the module-level ``data`` template is left
    untouched.
    """
    payload = dict(data, name=username, password=password)
    body = urllib.parse.urlencode(payload)
    # SECURITY: the original `print(data)` leaked the plaintext password
    # to stdout; removed. verify=False disables TLS certificate
    # validation — kept for behavior compatibility but should be removed.
    req = requests.post(url, headers=header, data=body, verify=False)
    cookies = requests.utils.dict_from_cookiejar(req.cookies)
    print(cookies)
    return cookies

爬虫/Include/豆瓣2/nezha.jpg

90.9 KB
Loading

爬虫/Include/豆瓣2/nezha.png

90.9 KB
Loading

爬虫/Include/豆瓣2/nezha.xls

159 KB
Binary file not shown.

爬虫/Include/豆瓣2/score.png

29.6 KB
Loading

0 commit comments

Comments
 (0)