
Commit 25c5bb8

add
1 parent c20dc7c commit 25c5bb8

File tree

18 files changed: +658 −46 lines


matplp/jieba/image/jishu.png

-232 KB
Binary file not shown.

爬虫/Include/csdn/mobai.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
import requests

# POST a job-search query to Zhaopin's mobile API and print the raw JSON reply
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    # session cookie captured from a browser session; it expires and will need refreshing
    'Cookie': 'acw_tc=2760824715607559510924771ef86ea31609a042abdeb32a42b51c684bb64f; x-zp-client-id=1b9430ab-2602-4ed8-b8a2-5bdb7d6a0e78; isShowSalary=t; select_city_code=489; select_city_name=%E5%85%A8%E5%9B%BD; sajssdk_2015_cross_new_user=1; Hm_lvt_08e585d395455886ebe17d4b393b6523=1560755962; isShowDownload=f; Hm_lpvt_08e585d395455886ebe17d4b393b6523=1560756695; sts_deviceid=16b6474cb3a3b9-09c17de8e4bef4-1a29140e-2073600-16b6474cb3b71a; jobRiskWarning=true; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216b644df12c433-0e9727c003282c-207a2549-341200-16b644df12d80f%22%2C%22%24device_id%22%3A%2216b644df12c433-0e9727c003282c-207a2549-341200-16b644df12d80f%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; ZPCITIESCLICKED=|635; LastCity=%E5%8D%97%E4%BA%AC; LastCity%5Fid=635; sts_sg=1; sts_sid=16b6474f17e8d6-068f25c51e25b7-1a29140e-2073600-16b6474f17fd1b; sts_chnlsid=Unknown; zp_src_url=https%3A%2F%2Fcompany.zhaopin.com%2FCZ219167780.htm; sou_experiment=unexperiment; ZP_OLD_FLAG=false; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1560758528; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1560758528; ZL_REPORT_GLOBAL={%22company%22:{%22actionid%22:%22a40af7db-6f0c-47ba-a386-f05e858a61ca-company%22%2C%22funczone%22:%22hiring_jd%22}%2C%22//www%22:{%22seid%22:%22%22%2C%22actionid%22:%2228f00cc7-ac8d-49d4-92ed-c61c700bff80-cityPage%22}%2C%22sou%22:{%22actionid%22:%223fd31c57-25b6-4e69-9dd4-91708cd522c3-sou%22%2C%22funczone%22:%22smart_matching%22}}; sts_evtseq=9'
}

data = {"pageIndex": 7, "pageSize": 20, "S_SOU_FULL_INDEX": "python", "S_SOU_WORK_CITY": "538", "at": "", "rt": "", "platform": 7, "d": "b89414a2-9303-4681-8939-d78cf1722eed", "channel": ""}
url = 'https://m.zhaopin.com/api/capi?capiUrl=position/search&x-zp-page-request-id=1faaa9a9e31b48408727df14500c75f9-1560756694516-139343&x-zp-client-id=1b9430ab-2602-4ed8-b8a2-5bdb7d6a0e78'
req = requests.post(url, data=data, headers=header)
res = req.json()
print(res)
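
The script stops at printing the raw JSON. A minimal parsing sketch follows, assuming (unverified) that the mobile API nests its results under data['list']; the key names here are hypothetical and should be checked against a live response:

# Hypothetical field names -- confirm against an actual API response
for job in res.get('data', {}).get('list', []):
    print(job.get('title'), job.get('salary'))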

爬虫/Include/csdn/test.py

Lines changed: 6 additions & 3 deletions
@@ -3,7 +3,10 @@
 from bs4 import BeautifulSoup
 
 header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
-
+# TODO: page-fetching helper, still a stub in this commit
+def getpage(url):
+    pass
+
 
 # Get basic profile info: visitor count, followers, etc.
 def getbaseinfor(url):
@@ -12,11 +15,11 @@ def getbaseinfor(url):
     soup=BeautifulSoup(res,'lxml')
     node=soup.find(id="asideProfile")
     count=node.dd.span.text  # total number of articles
-    fan=node.select("#fanBox")[0].get('title')#follower count
+    fan=node.select("#fanBox")[0].get('title')  # follower count
     love=soup.find(attrs={'class':'data-info d-flex item-tiling'}).find_all("dl")[2].get('title')  # likes
     comment=soup.find(attrs={'class':'data-info d-flex item-tiling'}).find_all("dl")[3].get('title')  # number of comments
 
-    node=node.find(attrs={'class':'grade-box clearfix'}).find_all('dl')
+    node=node.find(attrs={'class': 'grade-box clearfix'}).find_all('dl')
     visit=node[1].dd.get('title')
     jifen=node[2].dd.get('title')
     rank=node[3].get('title')
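
The getpage stub added above does nothing yet. One plausible implementation, assuming the file imports requests at its top (not shown in this hunk) and that the helper is meant to return a page's HTML:

def getpage(url):
    # fetch a page with the shared User-Agent header and return its HTML
    res = requests.get(url, headers=header)
    res.encoding = res.apparent_encoding  # guard against misdetected encodings
    return res.text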

爬虫/Include/csdn/test3.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
import urllib.request, urllib.parse
import json

url = "https://fe-api.zhaopin.com/c/i/sou?"
kw_work = input("Enter the keyword of the job you want to search for: ")
city = input("Enter the city: ")
start_page = int(input("Enter the first page to scrape: "))
end_page = int(input("Enter the last page to scrape: "))
for page in range(start_page, end_page + 1):
    data = {
        'start': page,
        'pageSize': '60',
        'cityId': city,
        'salary': '0,0',
        'workExperience': '-1',
        'education': '-1',
        'companyType': '-1',
        'jobWelfareTag': '-1',
        'kw': kw_work,  # the search keyword entered above
        'kt': '3',
        '': '0',
        '_v': '0.08095475',
        'x-zp-page-request-id': 'a5a5b670d31c43b79fad5a8d98622136-1556194064568-484956'
    }
    url_now = url + urllib.parse.urlencode(data)  # build the real request URL
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    request = urllib.request.Request(url=url_now, headers=headers)
    response = urllib.request.urlopen(request)  # send the request, get the response
    real_data = json.loads(response.read().decode())  # real_data is now a dict
    print(real_data)
    # for data in real_data['data']['results']:
    #     data_list = []
    #     job_name = data['jobName']  # job title
    #     data_list.append(job_name)
    #     job_salary = data['salary']  # salary
    #     data_list.append(job_salary)
    #     job_welfare = json.loads(data['positionLabel'])['jobLight']  # unlike the fields above, this must first be parsed into a dict before indexing
    #     data_list.append(job_welfare)
    #     job_experence = data['workingExp']['name']  # work experience
    #     data_list.append(job_experence)
    #     job_eduLevel = data['eduLevel']['name']  # education level
    #     data_list.append(job_eduLevel)
    #     job_company = data['company']['name']  # company name
    #     data_list.append(job_company)
    #     job_companytype = data['company']['type']['name']  # company type
    #     data_list.append(job_companytype)
    #     job_url = data['positionURL']  # detail page URL
    #     data_list.append(job_url)
    #     # Write the rows to a txt file; an Excel sheet would work just as well
    #     with open('data.txt', 'a') as f:
    #         f.write(str(data_list))
    #         f.write("\n")
print("Scraping finished!")

爬虫/Include/csdn/yx.xls

155 KB
Binary file not shown.

爬虫/Include/csdn/zhilia.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
import urllib.request
import json
# library for extracting values from JSON documents by path
import jsonpath
# library for writing .xls spreadsheets
import xlwt

n = 0
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet(u'yx', cell_overwrite_ok=True)
# write(row, col, value): fill in the header row of the sheet
sheet1.write(0, 1, "Company name")
sheet1.write(0, 2, "City")
sheet1.write(0, 3, "Company size")
sheet1.write(0, 4, "Type")
sheet1.write(0, 5, "Company website")
sheet1.write(0, 6, "Position")
sheet1.write(0, 7, "Education required")
sheet1.write(0, 8, "Salary")
sheet1.write(0, 9, "Employment type")
sheet1.write(0, 10, "Benefits")

for i in range(1, 10):
    url3 = "https://fe-api.zhaopin.com/c/i/sou?start=" + str(
        i * 90) + "&pageSize=90&cityId=530&industry=160400&workExperience=-1&education=4&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=java&kt=3&_v=0.20600649&x-zp-page-request-id=a0a5c8da8e5e455ca30312a4d85fa52d-1548559285341-380683"
    req = urllib.request.Request(url3)
    req.add_header("User-Agent",
                   "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 QIHU 360SE")
    data = urllib.request.urlopen(req).read()
    data = json.loads(data)
    jobName = jsonpath.jsonpath(data, '$..jobName')
    searchTag = jsonpath.jsonpath(data, "$..searchTag")

    company = jsonpath.jsonpath(data, "$..company")
    companyName = jsonpath.jsonpath(data, "$..company.name")
    companyPeopleNum = jsonpath.jsonpath(data, "$..company.size.name")
    companyType = jsonpath.jsonpath(data, "$..company.type.name")
    companyUrl = jsonpath.jsonpath(data, "$..company.url")

    city = jsonpath.jsonpath(data, "$..city")
    cityName = jsonpath.jsonpath(data, "$..city.display")

    workingExp = jsonpath.jsonpath(data, "$..workingExp")
    workingExpName = jsonpath.jsonpath(data, "$..workingExp.name")

    jobType = jsonpath.jsonpath(data, "$..jobType")
    jobTypeName = jsonpath.jsonpath(data, "$..jobType.display")

    eduLevel = jsonpath.jsonpath(data, "$..eduLevel")
    eduLevelName = jsonpath.jsonpath(data, "$..eduLevel.name")

    welfare = jsonpath.jsonpath(data, "$..welfare")
    salary = jsonpath.jsonpath(data, "$..salary")
    emplType = jsonpath.jsonpath(data, "$..emplType")
    jobTag = jsonpath.jsonpath(data, "$..jobTag.searchTag")

    # use j for the inner index so it doesn't shadow the page counter i;
    # range(0, 90) covers all 90 results per page (the original stopped at 89)
    for j in range(0, 90):
        print("Company no.: " + str(n))
        print(companyName[j])
        print(cityName[j])
        print(companyPeopleNum[j])
        print(companyType[j])
        print(companyUrl[j])
        print(workingExpName[j])
        # print(jobTypeName[j])
        print(eduLevelName[j])
        # print(welfare[j])
        print(salary[j])
        print(emplType[j])
        # print(jobTag[j])
        print()
        n = n + 1
        sheet1.write(n, 0, n)
        sheet1.write(n, 1, companyName[j])
        sheet1.write(n, 2, cityName[j])
        sheet1.write(n, 3, companyPeopleNum[j])
        sheet1.write(n, 4, companyType[j])
        sheet1.write(n, 5, companyUrl[j])
        # sheet1.write(n, 6, jobTypeName[j])
        sheet1.write(n, 7, eduLevelName[j])
        # sheet1.write(n, 8, welfare[j])
        sheet1.write(n, 8, salary[j])
        sheet1.write(n, 9, emplType[j])
        # sheet1.write(n, 10, jobTag[j])

myxls.save('yx.xls')
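
One fragility worth noting: jsonpath.jsonpath returns False, not an empty list, when a path matches nothing, so a single missing field makes the indexing above raise. A small guard, as a sketch:

def jp(doc, path):
    # normalize jsonpath's False-on-no-match to an empty list
    result = jsonpath.jsonpath(doc, path)
    return result if result else []

Routing every lookup through jp(...) and bounding the inner loop by the shortest list length would keep short or malformed pages from crashing the run.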

爬虫/Include/selenium/baidu.png

-3.38 KB

爬虫/Include/selenium/itcast.png

-357 KB
Binary file not shown.

爬虫/Include/selenium/seleniumtest.py

Lines changed: 25 additions & 24 deletions
@@ -21,42 +21,43 @@
 
 # take a snapshot of the current page and save it
 driver.save_screenshot("baidu.png")
-
+time.sleep(3)
 # id="kw" is Baidu's search input box; type a query string
-driver.find_element_by_id("kw").send_keys(u"印正荣")
 
-# id="su" is Baidu's search button; click() simulates a mouse click
+driver.find_element_by_id("kw").send_keys("印正荣")
+
+# # id="su" is Baidu's search button; click() simulates a mouse click
 driver.find_element_by_id("su").click()
 time.sleep(2)
-# take a new page snapshot
-driver.save_screenshot("我儿.png")
+# # take a new page snapshot
+# driver.save_screenshot("我儿.png")
 
-# print the rendered page source
-#print (driver.page_source)
+# # print the rendered page source
+# #print (driver.page_source)
 
-# get the current page's cookies
-print (driver.get_cookies())
+# # get the current page's cookies
+# print (driver.get_cookies())
 
-# Ctrl+A: select all input box content
-driver.find_element_by_id("kw").send_keys(Keys.CONTROL,'a')
+# # Ctrl+A: select all input box content
+# driver.find_element_by_id("kw").send_keys(Keys.CONTROL,'a')
 
-# Ctrl+X: cut the input box content
-driver.find_element_by_id("kw").send_keys(Keys.CONTROL,'x')
+# # Ctrl+X: cut the input box content
+# driver.find_element_by_id("kw").send_keys(Keys.CONTROL,'x')
 
-# retype content into the input box
-driver.find_element_by_id("kw").send_keys("赛哥哥")
-time.sleep(2)
-# simulate the Enter key
-driver.find_element_by_id("su").send_keys(Keys.RETURN)
+# # retype content into the input box
+# driver.find_element_by_id("kw").send_keys("赛哥哥")
+# time.sleep(2)
+# # simulate the Enter key
+# driver.find_element_by_id("su").send_keys(Keys.RETURN)
 
-# clear the input box
-driver.find_element_by_id("kw").clear()
+# # clear the input box
+# driver.find_element_by_id("kw").clear()
 
-# take a new page snapshot
-driver.save_screenshot("itcast.png")
+# # take a new page snapshot
+# driver.save_screenshot("itcast.png")
 
-# get the current URL
-print (driver.current_url)
+# # get the current URL
+# print (driver.current_url)
 
 # close the current page; if it is the only page, the browser exits
 # driver.close()
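
A side note, not part of this commit: the find_element_by_id helpers used throughout were removed in Selenium 4. Under that version the equivalent calls go through the By locator:

from selenium.webdriver.common.by import By

driver.find_element(By.ID, "kw").send_keys("selenium")
driver.find_element(By.ID, "su").click()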

爬虫/Include/selenium/test.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
from selenium import webdriver

# quick smoke test: open Baidu and print the page title
driver = webdriver.Chrome()
driver.get("https://www.baidu.com/")
print(driver.title)
driver.quit()  # close the smoke-test browser so it doesn't linger


import os

import requests
from bs4 import BeautifulSoup

user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
headers = {'User-Agent': user_agent}
# Search Baidu Images for the pictures to download beforehand and copy the
# result URL; the query here is "证件照" (ID photo)
httpUrl = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1526001481384_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&hs=2&word=%E7%99%BB%E8%AE%B0%E7%85%A7"


def main():
    driver = webdriver.Chrome()
    driver.get(httpUrl)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    imglist = soup.find_all("img", {'class': 'main_img img-hover'})  # result thumbnails
    x = 0
    for img in imglist:
        print(img['data-imgurl'])
        saveImg(img['data-imgurl'], x)
        x += 1
    driver.close()


def saveImg(pic_link, x):
    path = "img/"  # output directory
    os.makedirs(path, exist_ok=True)  # make sure it exists before writing
    pp = requests.get(pic_link, headers=headers)
    pth = path + str(x) + ".png"  # image file name
    with open(pth, "wb") as f:
        for chunk in pp.iter_content(1024):  # stream the image bytes in chunks
            f.write(chunk)
    print("Image %s downloaded" % x)


if __name__ == '__main__':
    main()
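
A possible refinement, not in the commit: run Chrome headless so the scrape does not open a visible window. With Selenium's standard Chrome options this looks roughly like:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # no visible browser window
driver = webdriver.Chrome(options=options)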
