python + selenium +pyquery 爬虫 爬取 1688详情图片 阿里巴巴详情图片 与标题 图片并进行压缩 仅供学习交流使用 .zip
python + selenium +pyquery 爬虫 爬取 1688详情图片 阿里巴巴详情图片 与标题 下载图片并进行压缩 仅供学习交流使用
代码片段和文件信息
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 15275600 2019-07-15 19:49 chromedriver_mac_64
文件 8543232 2019-07-15 20:29 chromedriver.exe
文件 11061936 2019-07-16 10:09 chromedriver_linux64
文件 5546 2019-08-17 23:12 seleniumDemo_back.py
# -*- coding: utf-8 -*
import datetime
import os
import random
import re
import time
import Image
import requests
from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.webdriver import ActionChains
from selenium.webdriver.common import keys
# Root folder for scraped 1688 goods; %s is later filled with the item title.
parentPath = "/Users/niubilea/Documents/ag/ali_goods/%s/"
# Raw downloaded images are written here.
downloadPath = parentPath + "download"
# tinypng-compressed copies are written here.
compressPath = parentPath + "compress"
def bluePrint(msg):
    """Print *msg* to stdout in bold blue via ANSI escape codes.

    Fixes: the scraped source lost the backslash in the escape sequence
    (' 33[1;34m' is not an escape code); parameter renamed from ``str``,
    which shadowed the builtin.
    """
    print('\033[1;34m' + msg + '\033[0m')
def redPrint(msg):
    """Print *msg* to stdout in bold red on black via ANSI escape codes.

    Fixes: restored the '\\033' escape lost in the scrape; parameter
    renamed from ``str``, which shadowed the builtin.
    """
    print('\033[1;31;40m' + msg + '\033[0m')
def openUrl(url):
    """Open *url* in Chrome via Selenium, scroll to the bottom so that
    lazy-loaded detail images are rendered, and return the page HTML.

    :param url: product-detail page URL to load
    :return: fully rendered ``page_source`` HTML string
    """
    browser = webdriver.Chrome("./chromedriver_mac_64")
    browser.get(url)
    top = 1000
    distance = 100
    # Scroll down in 130 ever-larger steps so every lazy-loaded image
    # enters the viewport and gets fetched.
    for i in range(130):
        print(i)
        top = top + i * distance
        js = "var q=document.documentElement.scrollTop=" + str(top)
        browser.execute_script(js)
        # Random sub-second pause between scrolls to look less bot-like.
        time.sleep(random.random())
    time.sleep(3)  # final wait for the last images to finish loading
    # Fix: original said `return browserbrowser.page_source` (typo) and
    # never quit the browser, leaking a ChromeDriver process per call.
    html = browser.page_source
    browser.quit()
    return html
def getPageHtml(pageUrl):
    """Fetch *pageUrl* over HTTP and return the decoded HTML text.

    Fixes: the scraped source dropped the commas between the dict items
    and between the ``requests.get`` arguments.

    :param pageUrl: URL to fetch
    :return: response body decoded as GBK text
    """
    print("开始获取html内容")
    headers = {'Content-type': 'text/html',
               'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100201 Firefox/22.0'}
    content = requests.get(pageUrl, timeout=30, headers=headers)
    # 1688 pages are served as GBK, not UTF-8.
    # content.encoding = 'utf-8'
    content.encoding = 'gbk'
    htmlsub = content.text
    print("获取内容完成")
    return htmlsub
# Create a file and write content into it.
# file_path: path of the file to create
# msg: the content to write
def create__file(file_path, msg):
    """Write *msg* to *file_path*, overwriting any existing content.

    Fixes: the original called ``f.close`` without parentheses, so the
    handle was never actually closed; ``with`` guarantees closure even
    if the write raises.
    """
    with open(file_path, "w") as f:
        f.write(msg)
def download_banner(title, content, downloadFolder, compressFolder):
    """Download the banner (gallery) images of a 1688 item page, then
    compress the whole folder via :func:`tinypng`.

    :param title: item title, used as the image filename prefix
    :param content: raw page HTML to scan for image URLs
    :param downloadFolder: directory that receives the raw downloads
    :param compressFolder: directory that receives the compressed copies
    """
    # Fix: the original pattern captured the closing quote inside the
    # group and used a greedy `.*`, which could swallow several src
    # attributes at once; non-greedy + quote outside the group.
    pic_url = re.findall(r'src="(https://cbu01.*?\.jpg)"', content)
    # Fix: `from urllib import urlretrieve` is Python 2; the py3 home
    # is urllib.request.
    from urllib.request import urlretrieve
    for i, key in enumerate(pic_url, start=1):
        time.sleep(0.3)  # throttle so the CDN doesn't rate-limit us
        temptitle = title + str(i)
        targetImgPath = downloadFolder + '/%s.jpg' % temptitle
        print(key + "\n")
        try:
            if key.find("https") >= 0:
                urlretrieve(key, targetImgPath)
            else:
                # Protocol-relative URL: prepend the scheme.
                urlretrieve("https:" + key, targetImgPath)
        except Exception as e:
            # Best-effort: a single failed image must not abort the batch.
            print(e)
    tinypng(downloadFolder, compressFolder)
def download_content(title, content, downloadFolder, compressFolder):
    """Download the detail-section images of a 1688 item page, then
    compress the whole folder via :func:`tinypng`.

    :param title: item title, used as the image filename prefix
    :param content: raw page HTML to scan for ``img src`` URLs
    :param downloadFolder: directory that receives the raw downloads
    :param compressFolder: directory that receives the compressed copies
    """
    pic_url = re.findall(r'img src="(.*?)"', content)
    # Fix: `from urllib import urlretrieve` is Python 2; the py3 home
    # is urllib.request.
    from urllib.request import urlretrieve
    for i, key in enumerate(pic_url, start=1):
        time.sleep(0.03)  # light throttle between detail images
        temptitle = title + str(i)
        targetImgPath = downloadFolder + '/%s.jpg' % temptitle
        print(key + "\n")
        try:
            if key.find("https") >= 0:
                urlretrieve(key, targetImgPath)
            else:
                # Protocol-relative URL: prepend the scheme.
                urlretrieve("https:" + key, targetImgPath)
        except Exception as e:
            # Best-effort: a single failed image must not abort the batch.
            print(e)
    tinypng(downloadFolder, compressFolder)
def tinypng(downloadcompress):
# 指定要压缩的文件夹
srcPath =download
# 压缩后文件夹
dstPath = compress
for filename in os.listdir(srcPath):
# 如果不存在目的目录则创建一个,保持层级结构
if not os.path.exists(dstPath):
os.makedirs(dstPath)
# 拼接完整的文件或文
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 15275600 2019-07-15 19:49 chromedriver_mac_64
文件 8543232 2019-07-15 20:29 chromedriver.exe
文件 11061936 2019-07-16 10:09 chromedriver_linux64
文件 5546 2019-08-17 23:12 seleniumDemo_back.py
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容, 请发送邮件举报,一经查实,本站将立刻删除。
评论列表(条)