最近需要一些东南亚国家的照片,于是写了一个穷游网游记的爬虫。穷游网用了 ajax 请求,简单的 requests 请求很难拿到图片的 url,于是用了 selenium 进行爬取。
from selenium import webdriver
import time,re,requests,os
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
def get_img_in_page(page_url='https://bbs.qyer.com/thread-3502079-1.html', wait_time=2,
                    driver_path=r'D:\ProgramData\Anaconda3\envs\spider\Scripts\chromedriver.exe'):
    """
    Scrape image URLs from a travelogue detail page.

    The site loads images via AJAX, so the page is rendered with a headless
    Chrome (Selenium) before parsing.

    :param page_url: URL of the travelogue detail page
    :param wait_time: seconds to wait for the page (and its AJAX content) to load
    :param driver_path: path to the local chromedriver executable
                        (adjust to where chromedriver is installed on your machine)
    :return: list of absolute image URLs found on the page
    """
    # Headless startup so no browser window pops up; webdriver.Chrome()
    # without options would start a visible browser instead.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Raw string for the Windows path: the original non-raw literal relied on
    # '\P', '\A', '\e', '\S' not being escape sequences (DeprecationWarning).
    browser = webdriver.Chrome(executable_path=driver_path, options=chrome_options)
    try:
        browser.get(page_url)
        time.sleep(wait_time)  # crude wait for the AJAX content to finish loading
        soup = BeautifulSoup(browser.page_source, 'lxml')
    finally:
        # quit() (not close()) also shuts down the chromedriver process;
        # close() only closes the window and leaks one driver per call.
        browser.quit()
    # Locate the post body by class, then pull the full-size image URLs out of
    # the lazy-load attribute data-original.
    b_list = soup.select('.flo_content')
    img_list = re.findall('data-original="(.*?)"', str(b_list))
    # The URLs are protocol-relative ("//..."); prefix the scheme.
    return ['https:' + u for u in img_list]
def get_page_list(keyword='东南亚', page='1'):
    """
    Search qyer.com forum posts by destination keyword and return the
    detail-page URL of every hit on the given result page.

    The search endpoint itself returns JSON, so plain requests is enough here
    (no Selenium needed).

    :param keyword: destination keyword to search for
    :param page: result page number as a string (the site caps at about 100)
    :return: list of absolute detail-page URLs
    :raises requests.HTTPError: if the search endpoint answers with an error status
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'Connection': 'close'
    }
    url_0 = 'https://search.qyer.com/newBbs'
    params = {
        'ajaxID': '5be149fd7b20e5c1476b7117',
        'keyword': keyword,
        'tab': 'bbs',
        'page': page,
        'post_time_mode': '1',
        'post_time_start': '',
        'post_time_end': '',
        'post_order': '1',
        'post_type': '0'
    }
    # timeout so a stalled server cannot hang the whole crawl forever;
    # raise_for_status so an HTTP error fails loudly instead of crashing
    # later inside .json() / key lookups.
    response = requests.get(url=url_0, headers=headers, params=params, timeout=10)
    response.raise_for_status()
    html_data = response.json()
    # t_url is protocol-relative ("//bbs.qyer.com/..."); add the scheme.
    return ['https:' + item['t_url'] for item in html_data['data']['data']['list']]
if __name__ == '__main__':
    # One sub-folder of images per Southeast-Asian destination keyword.
    keyword_list = ['缅甸', '泰国', '柬埔寨', '老挝', '越南',
                    '菲律宾', '马来西亚', '文莱', '印度尼西亚', '东帝汶']
    for keyword in keyword_list:
        max_page = 101   # the site serves at most ~100 result pages
        name_num = 0     # running image counter per keyword, used in filenames
        for page_num in range(1, max_page):
            # Narrow, logged error handling instead of the original nested bare
            # `except:` clauses, which also swallowed KeyboardInterrupt and made
            # the crawler impossible to stop or debug.
            try:
                detail_page_list = get_page_list(keyword=keyword, page=str(page_num))
            except Exception as e:
                print('[%s 第%s页] 搜索失败: %s' % (keyword, page_num, e))
                continue
            save_root = './img/' + keyword + '/'
            # makedirs also creates the intermediate './img' directory;
            # os.mkdir would fail if it did not already exist.
            os.makedirs(save_root, exist_ok=True)
            for detail_num, detail_url in enumerate(detail_page_list, start=1):
                try:
                    img_url_list = get_img_in_page(page_url=detail_url, wait_time=1)
                except Exception as e:
                    print('[%s] 游记抓取失败 %s: %s' % (keyword, detail_url, e))
                    continue
                for img_num, img_url in enumerate(img_url_list, start=1):
                    try:
                        # Counter advances even if this download fails, matching
                        # the original numbering behavior.
                        name_num += 1
                        img_name = save_root + keyword + '_' + str(name_num) + '.jpg'
                        img_data = requests.get(img_url, timeout=10).content
                        with open(img_name, 'wb') as fp:
                            fp.write(img_data)
                        print("[%s 第%s页] [第%s篇游记] [第%s/%s张图] [存储地址:%s]"%(keyword,str(page_num),detail_num,img_num,len(img_url_list),img_name))
                        time.sleep(0.1)  # be polite to the image server
                    except Exception as e:
                        print('图片下载失败 %s: %s' % (img_url, e))
                        continue
因篇幅问题不能全部显示,请点此查看更多更全内容
Copyright © 2019- sarr.cn 版权所有 赣ICP备2024042794号-1
违法及侵权请联系:TEL:199 1889 7713 E-MAIL:2724546146@qq.com
本站由北京市万商天勤律师事务所王兴未律师提供法律服务