Scraping Ctrip travel information with Python (attractions, hotels, and food)


The main idea of this crawler: first work out the URL patterns Ctrip uses, then extract the needed fields from the HTML with BeautifulSoup4, and finally package and store the results. The data was scraped solely for my graduation project research, for non-commercial use. Related notes on the graduation project are at:
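In outline, each script below follows the same three-step pattern. Here is a minimal sketch of that pattern (the URL and User-Agent below are placeholders, not Ctrip's actual endpoints; the "rdetailbox" class is the one the food and sight scripts query):

import requests
from bs4 import BeautifulSoup

# 1. build a list-page URL from the site's URL pattern (placeholder URL)
url = "https://example.com/fooditem/changsha148.html"
html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
# 2. pull the wanted nodes out of the HTML with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
items = soup.find_all("div", attrs={"class": "rdetailbox"})
# 3. package each hit into a record for later storage
records = [{"name": it.get_text(strip=True)} for it in items]
print(records)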
Below is my code for scraping food listings:
# -*- coding: utf-8 -*-
import requests
import io
from bs4 import BeautifulSoup as BS
import time

"""Scrape data from the web"""
headers = {
    "Origin": "",  # value was stripped when the post was published
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
places = ["beijing1", "shanghai2", "changsha148", "sanya61", "chongqing158", "hongkong38", "chengdu104", "haerbin151",
          "xian7", "guangzhou152", "hangzhou14"]
placenames = ["北京", "上海", "长沙", "三亚", "重庆", "香港", "成都", "哈尔滨", "西安", "广州", "杭州"]
places = ["changsha148"]   # restrict this run to a single city
placenames = ["长沙"]
base = "/fooditem/"        # site prefix was stripped when the post was published
base2 = ""                 # likewise stripped; prepended to relative detail links below
requestlist = []
for j in range(len(places)):   # build list-page URLs for each city's specialty dishes
    requestlist.append({"url": base + places[j] + ".html", "place": placenames[j]})
    for i in range(2, 2):      # empty range: only page 1 is fetched in this run
        tmp = base + places[j] + "/s0-p" + str(i) + ".html"
        requestlist.append({"url": tmp, "place": placenames[j]})
# each entry pairs a URL with the city being queried
print(requestlist)
l = []
count = 1
for i in range(len(requestlist)):
    response = requests.get(requestlist[i]["url"], headers=headers)
    soup = BS(response.text, 'html.parser')
    vs = soup.find_all(name="div", attrs={"class": "rdetailbox"})
    print("len(vs)", len(vs))
    for j in range(len(vs)):
        print("processing item:", j)
        try:
            # link to the detail sub-page
            href = vs[j].find(name="a", attrs={"target": "_blank"}).attrs["href"]
            # request the sub-page for the detailed information
            res = requests.get(base2 + href, headers=headers)
            print("currently visiting:", base2 + href)
            with open("3.html", "w", encoding="utf-8") as f:   # dump for debugging
                f.write(res.text)
            soupi = BS(res.text, "html.parser")   # HTML of the detail page
            vis = soupi.find_all(name="li", attrs={"class": "infotext"})   # description nodes
            introduce = []
            for k in range(len(vis)):   # renamed from i to avoid shadowing the outer loop index
                introduce.append(vis[k].get_text())
            imglinks = soupi.find_all(name="a", attrs={"href": "javascript:void(0)"})
            tmp = {}
            tmp["id"] = count
            tmp["name"] = vs[j].find(name="a", attrs={"target": "_blank"}).string
            tmp["name"] = tmp["name"].replace(" ", "").replace("\n", "")
            tmp["introduce"] = introduce
            tmp["img"] = imglinks
            tmp["city"] = requestlist[i]["place"]
            count = count + 1
            l.append(tmp)
            # one .txt file per dish, named after the dish
            with io.open("/Users/hujinhong/PycharmProjects/untitled5/food/changsha/" + tmp["name"] + ".txt", 'w', encoding="utf-8") as f:
                f.write(str(tmp))
            time.sleep(1)   # pause between requests
        except Exception as e:
            print(e)
for i in l:
    print(i)
The scrape successfully produced data such as the following:
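Each saved .txt file holds one dict per dish, in this shape (field values here are illustrative placeholders, not actual scraped data):

{'id': 1, 'name': '…', 'introduce': ['…'], 'img': [...], 'city': '长沙'}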
The code for scraping Ctrip sights is as follows:
# -*- coding: utf-8 -*-
import requests
import io
from bs4 import BeautifulSoup as BS
import time

"""Scrape data from the web"""
headers = {
    "Origin": "",  # value was stripped when the post was published
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
places = ["beijing1", "shanghai2", "changsha148", "sanya61", "chongqing158", "hongkong38", "chengdu104", "haerbin151",
          "xian7", "guangzhou152", "hangzhou14"]
placenames = ["北京", "上海", "长沙", "三亚", "重庆", "香港", "成都", "哈尔滨", "西安", "广州", "杭州"]
places = ["beijing1"]   # restrict this run to a single city
placenames = ["北京"]
city = "beijing"
base = "/sight/"        # site prefix was stripped when the post was published
base2 = ""
requestlist = []
for j in range(len(places)):   # several list pages per city
    requestlist.append({"url": base + places[j] + ".html", "place": placenames[j]})
    for i in range(2, 4):      # pages 2 and 3
        tmp = base + places[j] + "/s0-p" + str(i) + ".html"
        requestlist.append({"url": tmp, "place": placenames[j]})
print(requestlist)
l = []
count = 1
for i in range(len(requestlist)):
    response = requests.get(requestlist[i]["url"], headers=headers)
    soup = BS(response.text, 'html.parser')
    vs = soup.find_all(name="div", attrs={"class": "rdetailbox"})
    print(len(vs))
    for j in range(len(vs)):
        print(j)
        try:
            # link to the detail sub-page
            href = vs[j].find(name="a", attrs={"target": "_blank"}).attrs["href"]
            # request the sub-page for the detailed sight information
            res = requests.get(base2 + href, headers=headers)
            print(base2 + href)
            with open("3.html", "w", encoding="utf-8") as f:   # dump for debugging
                f.write(res.text)
            soupi = BS(res.text, "html.parser")
            vis = soupi.find_all(name="div", attrs={"class": "text_style"})   # description nodes
            introduce = []
            for k in range(len(vis)):
                introduce.append(vis[k].get_text())
            imgs = []
            imglinks = soupi.find_all(name="img", attrs={"width": "350"})
            for img in imglinks:
                imgs.append(img.attrs["src"])
            # overall score plus the per-aspect scores from the comment panel
            score = soupi.find(name="span", attrs={"class": "score"}).b.get_text()
            scores = [score]
            scorelinks = soupi.find(name="dl", attrs={"class": "comment_show"}).find_all(name="dd")
            for link in scorelinks:
                scores.append(link.find(name="span", attrs={"class": "score"}).string)
            comments = []
            commentlinks = soupi.find_all(name="span", attrs={"class": "heightbox"})
            for link in commentlinks:
                comments.append(link.get_text())
            tmp = {}
            tmp["id"] = count
            tmp["name"] = vs[j].find(name="a", attrs={"target": "_blank"}).string
            tmp["name"] = tmp["name"].replace(" ", "").replace("\n", "")
            tmp["introduce"] = introduce
            tmp["score"] = scores
            tmp["position"] = vs[j].find_all(name="dd", attrs={"class": "ellipsis"})[0].string
            tmp["position"] = tmp["position"].replace(" ", "").replace("\n", "")
            tmp["img"] = imgs
            tmp["city"] = city
            tmp["grade"] = soupi.find_all(name="span", attrs={"class": "s_sight_con"})[0].get_text()
            tmp["grade"] = tmp["grade"].replace(" ", "").replace("\n", "")
            count = count + 1
            l.append(tmp)
            print("tmp:", tmp)
            with io.open("/Users/hujinhong/PycharmProjects/untitled5/jingdian/beijing/" + tmp["name"] + ".txt", 'w', encoding="utf-8") as f:
                f.write(str(tmp))
            time.sleep(1)   # pause between requests
        except Exception as e:
            print(e)
print(l)
# browser.close()   # close the browser
# with open("", 'w', encoding='utf-8') as f:   # output filename was stripped
#     f.write(str(l))
# with open("data2.pk", "wb") as f:
#     pickle.dump(l, f)
# sample hotel list URL: /hotel/qingdao7/star2/k1%E4%BA%94%E5%9B%9B%E5%B9%BF%E5%9C%BA#ctm_ref=ctr_hp_sb_lst
The Ctrip sight data was scraped successfully. (screenshot omitted)
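A sight record carries more fields than a food record; its shape (values again illustrative placeholders) is:

{'id': 1, 'name': '…', 'introduce': ['…'], 'score': ['4.7', …], 'position': '…', 'img': ['…/photo.jpg'], 'city': 'beijing', 'grade': '…'}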
The hotel scraping code:
# -*- coding: utf-8 -*-
import requests
import io
from bs4 import BeautifulSoup as BS
import time

"""Scrape data from the web"""
headers = {
    "Origin": "",  # value was stripped when the post was published
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
places = ["beijing1", "shanghai2", "Changsha206", "sanya61", "chongqing158", "hongkong38", "chengdu104", "haerbin151",
          "xian7", "guangzhou152", "Hangzhou17"]
placenames = ["北京", "上海", "长沙", "三亚", "重庆", "香港", "成都", "哈尔滨", "西安", "广州", "杭州"]
places = ["Hangzhou17"]   # restrict this run to a single city
placenames = ["杭州"]
numid = 17                # city id used in the hotel detail URLs
base = "/hotel/"          # site prefix was stripped when the post was published
base2 = ""
requestlist = []
for j in range(len(places)):   # build the hotel list-page URLs
    requestlist.append({"url": base + places[j] + ".html", "place": placenames[j]})
    for i in range(2, 4):      # pages 2 and 3
        tmp = base + places[j] + "/s0-p" + str(i) + ".html"
        requestlist.append({"url": tmp, "place": placenames[j]})
# each entry pairs a URL with the city being queried
print(requestlist)
l = []
count = 1
for i in range(len(requestlist)):
    response = requests.get(requestlist[i]["url"], headers=headers)
    soup = BS(response.text, 'html.parser')
    print(soup)
    vs = soup.find_all(name="div", attrs={"class": "hotel_new_list"})
    print("len(vs)", len(vs))
    for j in range(len(vs)):
        print("processing item:", j)
        try:
            daid = vs[j].find(name="h2", attrs={"class": "hotel_name"}).attrs["data-id"]
            # build the detail-page link from the hotel id
            href1 = "/hotel/" + daid + ".html?isFull=F"
            print(daid)
            href = href1 + "&masterhotelid=" + daid + "&hcityid=" + str(numid) + "#ctm_ref=hod_sr_lst_dl_n_2_" + str(j + 1)
            print("href", href)
            # request the sub-page for the detailed hotel information
            res = requests.get(href, headers=headers)
            with open("3.html", "w", encoding="utf-8") as f:   # dump for debugging
                f.write(res.text)
            soupi = BS(res.text, "html.parser")   # HTML of the detail page
            vis = soupi.find_all(name="div", attrs={"class": "hotel_info_comment"})   # review nodes
            introduce = []
            for k in range(len(vis)):
                introduce.append(vis[k].get_text())
            imglinks = soupi.find(name="div", attrs={"data-index": "0"}).attrs["_src"]
            print(type(soupi.find(name="div", attrs={"data-index": "0"})))
            print(imglinks)
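            # The original post breaks off here. What follows is a plausible
            # continuation sketched after the food and sight scripts above, not
            # the author's code: the field names mirror those scripts and the
            # output path is illustrative.
            tmp = {}
            tmp["id"] = count
            tmp["name"] = vs[j].find(name="h2", attrs={"class": "hotel_name"}).get_text().replace(" ", "").replace("\n", "")
            tmp["introduce"] = introduce
            tmp["img"] = imglinks
            tmp["city"] = requestlist[i]["place"]
            count = count + 1
            l.append(tmp)
            with io.open("/Users/hujinhong/PycharmProjects/untitled5/hotel/hangzhou/" + tmp["name"] + ".txt", 'w', encoding="utf-8") as f:
                f.write(str(tmp))
            time.sleep(1)   # pause between requests
        except Exception as e:
            print(e)
print(l)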
