Scraping Ctrip travel information with Python (attractions, hotels, and food)


The main idea of this crawler: first work out the URL patterns Ctrip uses, then extract the needed fields from the HTML with BeautifulSoup4, and finally package and store the results. The data was scraped solely for my graduation project research, for non-commercial use. Related notes on the graduation project are at:
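In outline, each script below follows the same three-step pattern. Here is a minimal sketch of that pattern (the URL and User-Agent below are placeholders, not Ctrip's actual endpoints; the "rdetailbox" class is the one the food and sight scripts query):

import requests
from bs4 import BeautifulSoup

# 1. build a list-page URL from the site's URL pattern (placeholder URL)
url = "https://example.com/fooditem/changsha148.html"
html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
# 2. pull the wanted nodes out of the HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
items = soup.find_all("div", attrs={"class": "rdetailbox"})
# 3. package each hit into a record for later storage
records = [{"name": it.get_text(strip=True)} for it in items]
print(records)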
Below is my code for scraping food listings:
# -*- coding: utf-8 -*-
import requests
import io
from bs4 import BeautifulSoup as BS
import time

"""Scrape data from the web"""
headers = {
    "Origin": "",  # value was stripped when the post was published
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
places = ["beijing1", "shanghai2", "changsha148", "sanya61", "chongqing158", "hongkong38", "chengdu104", "haerbin151",
          "xian7", "guangzhou152", "hangzhou14"]
placenames = ["北京", "上海", "长沙", "三亚", "重庆", "香港", "成都", "哈尔滨", "西安", "广州", "杭州"]
places = ["changsha148"]   # restrict this run to a single city
placenames = ["长沙"]
base = "/fooditem/"        # site prefix was stripped when the post was published
base2 = ""                 # likewise stripped; prepended to relative detail links below
requestlist = []
for j in range(len(places)):   # build list-page URLs for each city's specialty dishes
    requestlist.append({"url": base + places[j] + ".html", "place": placenames[j]})
    for i in range(2, 2):      # empty range: only page 1 is fetched in this run
        tmp = base + places[j] + "/s0-p" + str(i) + ".html"
        requestlist.append({"url": tmp, "place": placenames[j]})
# each entry pairs a URL with the city being queried
print(requestlist)
l = []
count = 1
for i in range(len(requestlist)):
    response = requests.get(requestlist[i]["url"], headers=headers)
    soup = BS(response.text, 'html.parser')
    vs = soup.find_all(name="div", attrs={"class": "rdetailbox"})
    print("len(vs)", len(vs))
    for j in range(len(vs)):
        print("processing item:", j)
        try:
            # link to the detail sub-page
            href = vs[j].find(name="a", attrs={"target": "_blank"}).attrs["href"]
            # request the sub-page for the detailed information
            res = requests.get(base2 + href, headers=headers)
            print("currently visiting:", base2 + href)
            with open("3.html", "w", encoding="utf-8") as f:   # dump for debugging
                f.write(res.text)
            soupi = BS(res.text, "html.parser")   # HTML of the detail page
            vis = soupi.find_all(name="li", attrs={"class": "infotext"})   # description nodes
            introduce = []
            for k in range(len(vis)):   # renamed from i to avoid shadowing the outer loop index
                introduce.append(vis[k].get_text())
            imglinks = soupi.find_all(name="a", attrs={"href": "javascript:void(0)"})
            tmp = {}
            tmp["id"] = count
            tmp["name"] = vs[j].find(name="a", attrs={"target": "_blank"}).string
            tmp["name"] = tmp["name"].replace(" ", "").replace("\n", "")
            tmp["introduce"] = introduce
            tmp["img"] = imglinks
            tmp["city"] = requestlist[i]["place"]
            count = count + 1
            l.append(tmp)
            # one .txt file per dish, named after the dish
            with io.open("/Users/hujinhong/PycharmProjects/untitled5/food/changsha/" + tmp["name"] + ".txt", 'w', encoding="utf-8") as f:
                f.write(str(tmp))
            time.sleep(1)   # pause between requests
        except Exception as e:
            print(e)
for i in l:
    print(i)
The scrape successfully produced data such as the following:
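Each saved .txt file holds one dict per dish, in this shape (field values here are illustrative placeholders, not actual scraped data):

{'id': 1, 'name': '…', 'introduce': ['…'], 'img': [...], 'city': '长沙'}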
The code for scraping Ctrip sights is as follows:
# -*- coding: utf-8 -*-
import requests
import io
from bs4 import BeautifulSoup as BS
import time

"""Scrape data from the web"""
headers = {
    "Origin": "",  # value was stripped when the post was published
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
places = ["beijing1", "shanghai2", "changsha148", "sanya61", "chongqing158", "hongkong38", "chengdu104", "haerbin151",
          "xian7", "guangzhou152", "hangzhou14"]
placenames = ["北京", "上海", "长沙", "三亚", "重庆", "香港", "成都", "哈尔滨", "西安", "广州", "杭州"]
places = ["beijing1"]   # restrict this run to a single city
placenames = ["北京"]
city = "beijing"
base = "/sight/"        # site prefix was stripped when the post was published
base2 = ""
requestlist = []
for j in range(len(places)):   # several list pages per city
    requestlist.append({"url": base + places[j] + ".html", "place": placenames[j]})
    for i in range(2, 4):      # pages 2 and 3
        tmp = base + places[j] + "/s0-p" + str(i) + ".html"
        requestlist.append({"url": tmp, "place": placenames[j]})
print(requestlist)
l = []
count = 1
for i in range(len(requestlist)):
    response = requests.get(requestlist[i]["url"], headers=headers)
    soup = BS(response.text, 'html.parser')
    vs = soup.find_all(name="div", attrs={"class": "rdetailbox"})
    print(len(vs))
    for j in range(len(vs)):
        print(j)
        try:
            # link to the detail sub-page
            href = vs[j].find(name="a", attrs={"target": "_blank"}).attrs["href"]
            # request the sub-page for the detailed sight information
            res = requests.get(base2 + href, headers=headers)
            print(base2 + href)
            with open("3.html", "w", encoding="utf-8") as f:   # dump for debugging
                f.write(res.text)
            soupi = BS(res.text, "html.parser")
            vis = soupi.find_all(name="div", attrs={"class": "text_style"})   # description nodes
            introduce = []
            for k in range(len(vis)):
                introduce.append(vis[k].get_text())
            imgs = []
            imglinks = soupi.find_all(name="img", attrs={"width": "350"})
            for img in imglinks:
                imgs.append(img.attrs["src"])
            # overall score plus the per-aspect scores from the comment panel
            score = soupi.find(name="span", attrs={"class": "score"}).b.get_text()
            scores = [score]
            scorelinks = soupi.find(name="dl", attrs={"class": "comment_show"}).find_all(name="dd")
            for link in scorelinks:
                scores.append(link.find(name="span", attrs={"class": "score"}).string)
            comments = []
            commentlinks = soupi.find_all(name="span", attrs={"class": "heightbox"})
            for link in commentlinks:
                comments.append(link.get_text())
            tmp = {}
            tmp["id"] = count
            tmp["name"] = vs[j].find(name="a", attrs={"target": "_blank"}).string
            tmp["name"] = tmp["name"].replace(" ", "").replace("\n", "")
            tmp["introduce"] = introduce
            tmp["score"] = scores
            tmp["position"] = vs[j].find_all(name="dd", attrs={"class": "ellipsis"})[0].string
            tmp["position"] = tmp["position"].replace(" ", "").replace("\n", "")
            tmp["img"] = imgs
            tmp["city"] = city
            tmp["grade"] = soupi.find_all(name="span", attrs={"class": "s_sight_con"})[0].get_text()
            tmp["grade"] = tmp["grade"].replace(" ", "").replace("\n", "")
            count = count + 1
            l.append(tmp)
            print("tmp:", tmp)
            with io.open("/Users/hujinhong/PycharmProjects/untitled5/jingdian/beijing/" + tmp["name"] + ".txt", 'w', encoding="utf-8") as f:
                f.write(str(tmp))
            time.sleep(1)   # pause between requests
        except Exception as e:
            print(e)
print(l)
# browser.close()   # close the browser
# with open("", 'w', encoding='utf-8') as f:   # output filename was stripped
#     f.write(str(l))
# with open("data2.pk", "wb") as f:
#     pickle.dump(l, f)
# sample hotel list URL: /hotel/qingdao7/star2/k1%E4%BA%94%E5%9B%9B%E5%B9%BF%E5%9C%BA#ctm_ref=ctr_hp_sb_lst
The Ctrip sight data was scraped successfully. (screenshot omitted)
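A sight record carries more fields than a food record; its shape (values again illustrative placeholders) is:

{'id': 1, 'name': '…', 'introduce': ['…'], 'score': ['4.7', …], 'position': '…', 'img': ['…/photo.jpg'], 'city': 'beijing', 'grade': '…'}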
The hotel scraping code:
# -*- coding: utf-8 -*-
import requests
import io
from bs4 import BeautifulSoup as BS
import time

"""Scrape data from the web"""
headers = {
    "Origin": "",  # value was stripped when the post was published
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
places = ["beijing1", "shanghai2", "Changsha206", "sanya61", "chongqing158", "hongkong38", "chengdu104", "haerbin151",
          "xian7", "guangzhou152", "Hangzhou17"]
placenames = ["北京", "上海", "长沙", "三亚", "重庆", "香港", "成都", "哈尔滨", "西安", "广州", "杭州"]
places = ["Hangzhou17"]   # restrict this run to a single city
placenames = ["杭州"]
numid = 17                # city id used in the hotel detail URLs
base = "/hotel/"          # site prefix was stripped when the post was published
base2 = ""
requestlist = []
for j in range(len(places)):   # build the hotel list-page URLs
    requestlist.append({"url": base + places[j] + ".html", "place": placenames[j]})
    for i in range(2, 4):      # pages 2 and 3
        tmp = base + places[j] + "/s0-p" + str(i) + ".html"
        requestlist.append({"url": tmp, "place": placenames[j]})
# each entry pairs a URL with the city being queried
print(requestlist)
l = []
count = 1
for i in range(len(requestlist)):
    response = requests.get(requestlist[i]["url"], headers=headers)
    soup = BS(response.text, 'html.parser')
    print(soup)
    vs = soup.find_all(name="div", attrs={"class": "hotel_new_list"})
    print("len(vs)", len(vs))
    for j in range(len(vs)):
        print("processing item:", j)
        try:
            daid = vs[j].find(name="h2", attrs={"class": "hotel_name"}).attrs["data-id"]
            # build the detail-page link from the hotel id
            href1 = "/hotel/" + daid + ".html?isFull=F"
            print(daid)
            href = href1 + "&masterhotelid=" + daid + "&hcityid=" + str(numid) + "#ctm_ref=hod_sr_lst_dl_n_2_" + str(j + 1)
            print("href", href)
            # request the sub-page for the detailed hotel information
            res = requests.get(href, headers=headers)
            with open("3.html", "w", encoding="utf-8") as f:   # dump for debugging
                f.write(res.text)
            soupi = BS(res.text, "html.parser")   # HTML of the detail page
            vis = soupi.find_all(name="div", attrs={"class": "hotel_info_comment"})   # review nodes
            introduce = []
            for k in range(len(vis)):
                introduce.append(vis[k].get_text())
            imglinks = soupi.find(name="div", attrs={"data-index": "0"}).attrs["_src"]
            print(type(soupi.find(name="div", attrs={"data-index": "0"})))
            print(imglinks)
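            # The original post breaks off here. What follows is a plausible
            # continuation sketched after the food and sight scripts above, not
            # the author's code: the field names mirror those scripts and the
            # output path is illustrative.
            tmp = {}
            tmp["id"] = count
            tmp["name"] = vs[j].find(name="h2", attrs={"class": "hotel_name"}).get_text().replace(" ", "").replace("\n", "")
            tmp["introduce"] = introduce
            tmp["img"] = imglinks
            tmp["city"] = requestlist[i]["place"]
            count = count + 1
            l.append(tmp)
            with io.open("/Users/hujinhong/PycharmProjects/untitled5/hotel/hangzhou/" + tmp["name"] + ".txt", 'w', encoding="utf-8") as f:
                f.write(str(tmp))
            time.sleep(1)   # pause between requests
        except Exception as e:
            print(e)
print(l)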
