Skip to content

Commit f50cec1

Browse files
committed
update
1 parent 11cb365 commit f50cec1

File tree

9 files changed

+239
-0
lines changed

9 files changed

+239
-0
lines changed

爬虫/Include/豆瓣2/analyse.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import matplotlib.pyplot as plt
2+
import matplotlib
3+
import jieba
4+
import xlwt
5+
import xlrd
6+
from wordcloud import WordCloud
7+
import numpy as np
8+
from collections import Counter
9+
# Configure matplotlib globally: SimHei so CJK labels render, and
# disable the unicode minus so axis ticks display correctly with it.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False
12+
def anylasescore(comment):
    """Plot a pie chart of the 1-5 star rating distribution.

    comment: list of rows; each row's index 2 is expected to hold the
    star rating as a digit string '1'..'5' (rows without a parseable
    rating are skipped). Saves the chart to score.png and shows it.
    """
    # score[1..5] counts ratings by star value; score[0] is unused padding
    # so the star value can index directly.
    score = [0, 0, 0, 0, 0, 0]
    count = 0
    for va in comment:
        try:
            score[int(va[2])] += 1
            count += 1
        except (ValueError, TypeError, IndexError):
            # Row without a valid rating — skip it.
            continue
    print(score)
    if count == 0:
        # BUGFIX: the original divided by count unconditionally and would
        # raise ZeroDivisionError on an empty/unrated data set.
        print('no rated comments to plot')
        return
    label = '1分', '2分', '3分', '4分', '5分'
    color = 'blue', 'orange', 'yellow', 'green', 'red'  # slice colors
    size = [0, 0, 0, 0, 0]
    explode = [0, 0, 0, 0, 0]
    # BUGFIX: the original looped `for i in range(1, 5)`, which left the
    # 1-star slice (size[0]) permanently at 0 and silently dropped the
    # 5-star counts (score[5]). size[i] is the percentage of (i + 1)-star
    # ratings.
    for i in range(5):
        size[i] = score[i + 1] * 100 / count
        explode[i] = score[i + 1] / count / 10
    pie = plt.pie(size, colors=color, explode=explode, labels=label,
                  shadow=True, autopct='%1.1f%%')
    # Shrink label and percentage text.
    for font in pie[1]:
        font.set_size(8)
    for digit in pie[2]:
        digit.set_size(8)
    plt.axis('equal')
    plt.title(u'各个评分占比', fontsize=12)
    plt.legend(loc=0, bbox_to_anchor=(0.82, 1))  # legend placement
    # Shrink the legend font.
    leg = plt.gca().get_legend()
    ltext = leg.get_texts()
    plt.setp(ltext, fontsize=6)
    plt.savefig("score.png")
    plt.show()
44+
def getzhifang(map):
    """Bar-chart the 15 most common words in *map* (a collections.Counter).

    NOTE(review): the parameter shadows the builtin ``map``; the name is
    kept so existing keyword callers keep working.
    """
    words = []
    counts = []
    for k, v in map.most_common(15):
        words.append(k)
        counts.append(v)
    Xi = np.array(words)
    Yi = np.array(counts)
    # BUGFIX (dead code): the original rebound `x = np.arange(0, 15, 1)`
    # and never used it; removed.
    width = 0.6
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK tick labels
    plt.figure(figsize=(8, 6))  # 8:6 aspect ratio
    plt.bar(Xi, Yi, width, color='blue', label='热门词频统计', alpha=0.8,)
    plt.xlabel("词频")
    plt.ylabel("次数")
    plt.show()
    return
62+
def getciyun_most(map):
    """Render word clouds of the 300 most common words in *map*.

    The top 150 words are written to img.jpg and the next 150 to
    img2.jpg. Requires nezha.jpg (mask image) and simhei.ttf in the
    working directory.
    """
    # BUGFIX (dead code): the original also accumulated the counts into a
    # list `y` that was never used; only the words are needed here.
    words = [k for k, _ in map.most_common(300)]
    xi = ' '.join(words[0:150])
    print(xi)
    backgroud_Image = plt.imread('nezha.jpg')  # mask shaping the cloud
    wc = WordCloud(background_color="white",
                   width=1500, height=1200,
                   # min_font_size=40,
                   mask=backgroud_Image,
                   # font_path is required for CJK text; without it the
                   # cloud renders as rows of empty boxes.
                   font_path="simhei.ttf",
                   max_font_size=150,  # cap the largest word
                   random_state=50,  # fixed state → reproducible layout
                   )
    my_wordcloud = wc.generate(xi)
    plt.imshow(my_wordcloud)
    my_wordcloud.to_file("img.jpg")
    xi = ' '.join(words[150:300])
    my_wordcloud = wc.generate(xi)
    my_wordcloud.to_file("img2.jpg")

    plt.axis("off")
def anylaseword(comment):
    """Tokenize comment texts (row index 3) with jieba, count word
    frequencies, and plot a bar chart plus word clouds.

    Tokens of length <= 1, the literal '\\r\\n', and words seen fewer
    than 5 times are dropped.
    """
    c = Counter()
    for va in comment:
        seg_list = jieba.cut(va[3], cut_all=False)
        for word in seg_list:
            if len(word) > 1 and word != '\r\n':
                # BUGFIX (cleanup): the original wrapped this increment in
                # try/except, but Counter.__getitem__ never raises; the
                # unused locals `low`, `index`, and the typo'd, never-read
                # `commnetstr` accumulator are removed as well.
                c[word] += 1
    # most_common() returns a snapshot list, so popping entries from the
    # Counter while looping over it is safe.
    for (k, v) in c.most_common():
        if v < 5:
            c.pop(k)
            continue
    print(len(c), c)
    getzhifang(c)
    getciyun_most(c)
114+
def anylase():
    """Load the scraped comments from nezha.xls and run both analyses."""
    data = xlrd.open_workbook('nezha.xls')  # open the workbook
    table = data.sheets()[0]  # first sheet
    comment = []
    # BUGFIX: the original hard-coded range(1, 500) and raised IndexError
    # whenever the sheet held fewer rows. Read whatever the sheet actually
    # contains (row 0 is skipped as in the original), keeping the same
    # 499-row cap for compatibility.
    for i in range(1, min(500, table.nrows)):
        comment.append(table.row_values(i))
    # print(comment)
    anylasescore(comment)
    anylaseword(comment)
123+
124+
if __name__ == '__main__':
    # Run the full analysis pipeline when executed as a script.
    anylase()
126+
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
import urllib.parse
4+
5+
import xlwt
6+
import xlrd
7+
8+
9+
10+
def login(username, password):
    """Log in to Douban via the mobile basic-login endpoint.

    Returns the session cookies as a plain dict.
    """
    url = 'https://accounts.douban.com/j/mobile/login/basic'
    # Browser-like headers expected by the endpoint.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony',
        'Origin': 'https://accounts.douban.com',
        'content-Type': 'application/x-www-form-urlencoded',
        'x-requested-with': 'XMLHttpRequest',
        'accept': 'application/json',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'connection': 'keep-alive',
        'Host': 'accounts.douban.com'
    }
    data = {
        'ck': '',
        'name': username,
        'password': password,
        'remember': 'false',
        'ticket': ''
    }
    payload = urllib.parse.urlencode(data)
    # BUGFIX (security): the original `print(data)` wrote the plaintext
    # password to stdout; removed.
    # SECURITY: verify=False disables TLS certificate validation — kept to
    # preserve existing behavior, but it should be removed.
    req = requests.post(url, headers=header, data=payload, verify=False)
    cookies = requests.utils.dict_from_cookiejar(req.cookies)
    print(cookies)
    return cookies
39+
def getcomment(cookies):
    """Scrape Douban comments for movie 26794435, 20 per page, writing
    (index, name, star, text) rows to nezha.xls.

    Stops when a page returns no comments or any request/parse error
    occurs; whatever was collected is still saved.
    """
    start = 0
    w = xlwt.Workbook(encoding='ascii')
    ws = w.add_sheet('sheet1')
    index = 1  # next worksheet row; row 0 is left empty
    while True:
        try:
            url = ('https://movie.douban.com/subject/26794435/comments?start='
                   + str(start) + '&limit=20&sort=new_score&status=P&comments_only=1')
            start += 20
            req = requests.get(url, cookies=cookies)
            res = req.json()
            res = res['html']
            soup = BeautifulSoup(res, 'lxml')
            node = soup.select('.comment-item')
            # BUGFIX: terminate cleanly when a page carries no comments
            # instead of relying solely on an exception to end the loop.
            if not node:
                break
            for va in node:
                name = va.a.get('title')
                # The rating is encoded in a class such as 'allstar40';
                # the second-to-last character is the star digit.
                star = va.select_one('.comment-info').select('span')[1].get('class')[0][-2]
                comment = va.select_one('.short').text
                print(name, star, comment)
                ws.write(index, 0, index)
                ws.write(index, 1, name)
                ws.write(index, 2, star)
                ws.write(index, 3, comment)
                index += 1
        except Exception as e:
            # Best-effort scrape: any network/parse failure ends the loop;
            # collected rows still get saved below.
            print(e)
            break
    w.save('nezha.xls')
69+
70+
71+
72+
73+
if __name__ == '__main__':
    # SECURITY NOTE(review): real account credentials are hard-coded in
    # source — they should be rotated immediately and read from
    # environment variables or getpass instead of being committed.
    cookies=login('15751512041','52cuihuini')
    getcomment(cookies)
77+
78+
79+
80+

爬虫/Include/豆瓣2/img.jpg

167 KB
Loading

爬虫/Include/豆瓣2/img2.jpg

170 KB
Loading

爬虫/Include/豆瓣2/login.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import requests
2+
import urllib.parse
3+
from http import cookiejar
4+
5+
# Douban mobile basic-login endpoint.
url='https://accounts.douban.com/j/mobile/login/basic'
# Browser-like request headers expected by the login endpoint.
header={'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony',
'Origin': 'https://accounts.douban.com',
'content-Type':'application/x-www-form-urlencoded',
'x-requested-with':'XMLHttpRequest',
'accept':'application/json',
'accept-encoding':'gzip, deflate, br',
'accept-language':'zh-CN,zh;q=0.9',
'connection': 'keep-alive'
,'Host': 'accounts.douban.com'
}
# Login form template; 'name' and 'password' are filled in by login().
data={
'ck':'',
'name':'',
'password':'',
'remember':'false',
'ticket':''
}
24+
def login(username, password):
    """Log in to Douban and return the session cookies as a dict.

    BUGFIX: the original declared ``global data`` and then rebound the
    module-level dict with ``data = urllib.parse.urlencode(data)``, so a
    second call would urlencode a *string* and post garbage. Build a
    local payload instead; the module-level ``data`` template is left
    untouched.
    """
    payload = dict(data, name=username, password=password)
    body = urllib.parse.urlencode(payload)
    # SECURITY: the original `print(data)` leaked the plaintext password
    # to stdout; removed. verify=False disables TLS certificate
    # validation — kept for behavior compatibility but should be removed.
    req = requests.post(url, headers=header, data=body, verify=False)
    cookies = requests.utils.dict_from_cookiejar(req.cookies)
    print(cookies)
    return cookies

爬虫/Include/豆瓣2/nezha.jpg

90.9 KB
Loading

爬虫/Include/豆瓣2/nezha.png

90.9 KB
Loading

爬虫/Include/豆瓣2/nezha.xls

159 KB
Binary file not shown.

爬虫/Include/豆瓣2/score.png

29.6 KB
Loading

0 commit comments

Comments
 (0)