python 爬虫爬取简历
python 爬虫爬取站长之家(chinaz.com)的简历模板,需要的看一看,毕业了,需要模板
代码片段和文件信息
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 240993 2016-01-13 14:44 jianli87821.jpg
----------- --------- ---------- ----- ----
240993 1
# -*- coding: UTF-8 -*-
import requests
import re
class Resume(object):
    """Scrape free resume templates from sc.chinaz.com and download them.

    Flow: fetch list page(s) -> collect detail-page URLs -> extract each
    template's download URL and title -> save the .rar archive under test/.
    """

    def __init__(self):
        # Browser-like User-Agent so the site serves normal pages.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                "AppleWebKit/537.36 (KHTML like Gecko) "
                "Chrome/67.0.3396.99 Safari/537.36"
            )
        }
        self.next_url_list = []   # detail-page URLs found on list pages
        self.Download_list = []   # [download_url, template_name] pairs

    def get_page(self, url):
        """Fetch one list page and collect the detail-page URLs it links to."""
        response = requests.get(url=url, headers=self.headers)
        if response.status_code == 200:
            print("请求成功")
            html = response.text
            # NOTE(review): the original pattern was destroyed by HTML
            # stripping in the scraped source (only 's' survived);
            # reconstructed from the chinaz list-page markup -- confirm
            # against the live page before relying on it.
            next_urls = re.findall(
                r'<a target="_blank" href="(.*?)" class="title_wl"', html)
            self.next_url_list.extend(next_urls)

    def parse_page(self, next_url):
        """Fetch one detail page; record its download URL and title."""
        response = requests.get(url=next_url, headers=self.headers)
        response.encoding = "UTF-8"  # pages are UTF-8 Chinese text
        if response.status_code == 200:
            print("请求成功")
            html = response.text
            # NOTE(review): original regex was mangled in the scraped
            # source; the anchor text "福建电信下载" (a download mirror) is
            # the surviving fragment -- confirm the full pattern.
            download_urls = re.findall(r'href="(.*?)"[^>]*>福建电信下载', html)
            names = re.findall(r'title": "(.*?)"', html)
            print('----------------------------------------------------')
            print(download_urls)
            print(names)
            print('----------------------------------------------------')
            # Guard against pages where a pattern failed to match instead
            # of raising IndexError as the original code would.
            if download_urls and names:
                self.Download_list.append([download_urls[0], names[0]])
                print(download_urls, names)

    def Download(self, download_url, name):
        """Download one archive and save it as test/<name>.rar."""
        import os  # local import keeps this block self-contained
        response = requests.get(url=download_url, headers=self.headers)
        if response.status_code == 200:
            # Original crashed with FileNotFoundError when test/ was absent.
            os.makedirs("test", exist_ok=True)
            with open("test/%s.rar" % name, "wb") as f:
                f.write(response.content)

    def main(self):
        """Crawl the list pages, then every detail page, then download all."""
        # NOTE(review): range(2, 3) yields only page 2, so the page == 1
        # branch below is dead code; kept as-is to preserve the original
        # crawl scope -- widen the range deliberately if page 1 is wanted.
        for page in range(2, 3):
            if page == 1:
                url = "http://sc.chinaz.com/jianli/free.html"
            else:
                url = "http://sc.chinaz.com/jianli/free_%s.html" % page
            print(url)
            self.get_page(url)
        for next_url in self.next_url_list:
            self.parse_page(next_url)
        for download_url, name in self.Download_list:
            self.Download(download_url, name)
# Script entry point: build the crawler and run the full pipeline.
# (Original used curly quotes “__main__“, which is a syntax error.)
if __name__ == "__main__":
    resume = Resume()
    resume.main()
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 240993 2016-01-13 14:44 jianli87821.jpg
----------- --------- ---------- ----- ----
240993 1
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件举报,一经查实,本站将立刻删除。
评论列表(条)