Fetching Mafengwo attraction (POI) data and attraction comment data

The MySQL table schemas used to store the scraped data:
CREATE TABLE `poi` (
  `poi_id` int NOT NULL,
  `name` varchar(128) DEFAULT NULL,
  `image` varchar(512) DEFAULT NULL,
  `link` varchar(512) DEFAULT NULL,
  `lat` float DEFAULT NULL,
  `lng` float DEFAULT NULL,
  `type` int DEFAULT NULL,
  `is_cnmain` int DEFAULT NULL,
  `country_mddid` int DEFAULT NULL,
  PRIMARY KEY (`poi_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
CREATE TABLE `poi_commnet` (
  `poi_id` int NOT NULL,
  `name` varchar(128) DEFAULT NULL,
  `date` varchar(128) DEFAULT NULL,
  `star` varchar(256) DEFAULT NULL,
  `comment` text
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
CREATE TABLE `poi_detail` (
  `poi_id` int NOT NULL,
  `name` varchar(128) DEFAULT NULL,
  `mdd` varchar(128) DEFAULT NULL,
  `enName` varchar(256) DEFAULT NULL,
  `commentCount` varchar(128) DEFAULT NULL,
  `description` text,
  `tel` varchar(128) DEFAULT NULL,
  `site` varchar(256) DEFAULT NULL,
  `time` varchar(128) DEFAULT NULL,
  `traffic` text,
  `ticket` text,
  `openingTime` text,
  `location` varchar(256) DEFAULT NULL,
  PRIMARY KEY (`poi_id`),
  CONSTRAINT `poi_id` FOREIGN KEY (`poi_id`) REFERENCES `poi` (`poi_id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
CREATE TABLE `poi_1` (
  `poi_id` int NOT NULL,
  `name` varchar(128) DEFAULT NULL,
  `image` varchar(512) DEFAULT NULL,
  `link` varchar(512) DEFAULT NULL,
  `lat` float DEFAULT NULL,
  `lng` float DEFAULT NULL,
  `type` int DEFAULT NULL,
  `is_cnmain` int DEFAULT NULL,
  `country_mddid` int DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
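
The crawler below stores everything in a local MySQL database named mafengwo (see DB_NAME in the class). Here is a minimal bootstrap sketch, assuming MySQL 8 on localhost with the root/123456 credentials used in the code, and assuming the three CREATE TABLE statements above are saved in a (hypothetical) schema.sql:

    import pymysql

    # Create the database and tables once, before the first crawl.
    conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456')
    cursor = conn.cursor()
    cursor.execute('CREATE DATABASE IF NOT EXISTS mafengwo '
                   'DEFAULT CHARSET utf8mb4 COLLATE utf8mb4_0900_ai_ci;')
    cursor.execute('USE mafengwo;')
    with open('schema.sql', 'r', encoding='utf-8') as f:
        # pymysql executes one statement at a time, so split on ';'
        for statement in f.read().split(';'):
            if statement.strip():
                cursor.execute(statement)
    conn.commit()
    conn.close()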
The crawler code:
import requests
import re
import time
import json
import hashlib
import logging
import threading
import pymysql
from bs4 import BeautifulSoup
import xlwt
import os
import math

comment_url = 'https://pagelet.mafengwo.cn/poi/pagelet/poiCommentListApi?'
requests_headers = {
    'Referer': 'https://www.mafengwo.cn/poi/12913.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
data_list = []
class MafengwoCrawler:
    # URL for querying destinations (mdd)
    # A destination contains POIs (attractions)
    URL_MDD = 'https://www.mafengwo.cn/mdd/'
    # URL for querying POIs
    # Returns each POI's detail link, image and name
    URL_ROUTE = 'https://www.mafengwo.cn/ajax/router.php'
    # URL for querying a POI's coordinates
    # longitude: lng
    # latitude: lat
    URL_POI = 'https://pagelet.mafengwo.cn/poi/pagelet/poiLocationApi'
    # URL for querying POI comments
    URL_COM = 'https://pagelet.mafengwo.cn/poi/pagelet/poiCommentListApi'
    # Common headers
    HEADERS = {
        'Referer': 'https://www.mafengwo.cn/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    }
    # MySQL connection info
    DB_HOST = 'localhost'
    DB_USER = 'root'
    DB_PASSWORD = '123456'
    DB_NAME = 'mafengwo'
    # Salt used to sign request data, fetched by _get_md5_encrypted_string()
    encrypted_string = ''
    # Pages that need no further crawling, i.e. pages already crawled successfully
    success_pages = []

    def __init__(self, log_file=None):
        # logging usage notes: https://www.cnblogs.com/nancyzhu/p/8551506.html
        logging.basicConfig(level=logging.DEBUG,
                            filename='mafengwo.' + str(int(time.time())) + '.log',
                            format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s')
        # Initialize the request session
        self.REQ = requests.session()
        # Set the common headers
        self.REQ.headers.update(self.HEADERS)
        # Fetch the salt needed to sign request data
        self._get_md5_encrypted_string()
        # If a log file is passed in, skip the pages it records as successful
        if log_file is not None:
            self.success_pages = self._read_log_file_get_success_page(log_file)
            print('Pages already crawled successfully: ' + str(len(self.success_pages)))
            print('Continuing in 5 seconds')
            time.sleep(5)
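
    # The original post never shows _read_log_file_get_success_page(); here is a
    # minimal sketch, assuming it only scans an earlier log file for the
    # "page success: N" lines written by crawler() below. Hypothetical code.
    def _read_log_file_get_success_page(self, log_file):
        pages = []
        with open(log_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Log lines end with e.g. "INFO: page success: 12"
                match = re.search(r'page success: (\d+)', line)
                if match:
                    pages.append(int(match.group(1)))
        return pages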
    def crawler_mdd(self, mdd_id=21536):
        '''
        Crawl the POI list of a single destination.
        Default: 21536, China
        '''
        # mdd_id = 12522  # Gulangyu, 16 pages, test data
        print(mdd_id)
        # Start crawling
        start = int(time.time())
        # First get the total page count
        res = self._get_route(mdd_id)
        page_total = res['pagecount']
        # Compute how many pages each thread crawls
        page_range = round(page_total / 20)
        if page_range == 0:
            page_range = 1
        logging.info('Total ' + str(page_total) + ' pages, each thread crawls ' + str(page_range) + ' pages')
        print('Total ' + str(page_total) + ' pages, each thread crawls ' + str(page_range) + ' pages')
        # Start the threads
        thread = []
        for i in range(1, page_total + 1, page_range):
            page_start = i
            page_end = i + page_range
            if page_end > page_total + 1:
                page_end = page_total + 1
            t = threading.Thread(target=self.crawler,
                                 args=(mdd_id, page_start, page_end))
            thread.append(t)
        for i in range(0, len(thread)):
            thread[i].start()
        for i in range(0, len(thread)):
            thread[i].join()
        end = int(time.time())
        logging.info('Total time: ' + str(end - start) + ' seconds')
        print('Total time: ' + str(end - start) + ' seconds')
    def crawler(self, mdd_id, start_page, end_page):
        '''
        The actual crawler.
        Time to show its true strength.
        '''
        # Connect to the database
        db = pymysql.connect(
            host=self.DB_HOST,
            port=3306,
            user=self.DB_USER,
            passwd=self.DB_PASSWORD,
            db=self.DB_NAME)
        for page in range(start_page, end_page):
            if page in self.success_pages:
                print('Skipping: ' + str(page))
                continue
            page_pass = False
            page_retry = 0
            while not page_pass and page_retry < 11:
                try:
                    print('Current page: ' + str(page))
                    result = self._get_route(mdd_id, page=page)['list']
                    # Save to the database
                    # Note: this column list includes mdd_id/mdd_name and differs
                    # from the poi DDL shown above; align one with the other.
                    sql = "INSERT IGNORE INTO poi(poi_id, poi_name, image, link, lat, lng, type, is_cnmain, country_id, mdd_id, mdd_name) " \
                          "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
                    params = []
                    # mdd_id -> destination name for the destinations this run targets
                    use_dict = {41249: "抚州", 22762: "鹰潭", 15219: "上饶", 12651: "衢州", 10124: "丽水",
                                12681: "宁德", 11498: "福州", 11784: "莆田", 14844: "三明", 12013: "南平"}
                    for item in result:
                        params.append((
                            item['poi_id'],
                            item['name'],
                            item['image'],
                            item['link'],
                            item['lat'],
                            item['lng'],
                            item['type'],
                            item['is_cnmain'],
                            item['country_mddid'],
                            mdd_id,
                            use_dict[mdd_id]
                        ))
                    try:
                        cursor = db.cursor()
                        cursor.executemany(sql, params)
                        db.commit()
                        # Success
                        logging.info('page success: ' + str(page))
                        print('page success: ' + str(page))
                        page_pass = True
                    except Exception as e:
                        logging.error(e)
                        # Roll back on error
                        db.rollback()
                except Exception as e:
                    page_retry += 1
                    logging.error(e)
        # Close the database
        db.close()
    def crawler_detail(self):
        '''
        Crawl POI details into the database.
        Requires the poi table to be populated first.
        Crawls in parallel via crawler_detail_worker.
        '''
        # Count the rows in the poi table
        db = pymysql.connect(
            host=self.DB_HOST,
            port=3306,
            user=self.DB_USER,
            passwd=self.DB_PASSWORD,
            db=self.DB_NAME)
        sql = 'SELECT COUNT(*) as total from poi;'
        cursor = db.cursor()
        cursor.execute(sql)
        result = cursor.fetchall()
        # Total row count
        total = result[0][0]
        db.close()
        # Start crawling
        start = int(time.time())
        # Compute how many rows each thread crawls
        range_count = round(total / 20)
        if range_count == 0:
            range_count = 1
        # Log
        logging.info('Total ' + str(total) + ' rows, each thread crawls ' + str(range_count) + ' rows')
        print('Total ' + str(total) + ' rows, each thread crawls ' + str(range_count) + ' rows')
        # Start the threads
        thread = []
        for i in range(0, total, range_count):
            # i, range_count: SQL query offset and row count
            t = threading.Thread(target=self.crawler_detail_worker,
                                 args=(i, range_count))
            thread.append(t)
        for i in range(0, len(thread)):
            thread[i].start()
        for i in range(0, len(thread)):
            thread[i].join()
        end = int(time.time())
        logging.info('Total time: ' + str(end - start) + ' seconds')
        print('Total time: ' + str(end - start) + ' seconds')
        return
    def crawler_detail_worker(self, offset, limit):
        '''Worker thread'''
        db = pymysql.connect(
            host=self.DB_HOST,
            port=3306,
            user=self.DB_USER,
            passwd=self.DB_PASSWORD,
            db=self.DB_NAME)
        sql = 'SELECT poi_id, name, link FROM poi ORDER BY poi_id LIMIT ' + \
            str(offset) + ', ' + str(limit) + ';'
        cursor = db.cursor()
        cursor.execute(sql)
        # Result set
        result = cursor.fetchall()
        detail_list = []
        c_count = 0
        save_count = 100  # Save to the database every this many rows, default 100
        for item in result:
            poi_id = item[0]
            name = item[1]
            link = item[2]
            # Before crawling, check whether this POI already has a detail row
            sql_select = 'SELECT poi_id FROM poi_detail WHERE poi_id=' + \
                str(poi_id) + ';'
            cursor.execute(sql_select)
            result_select = cursor.fetchall()
            # Skip it if it has already been crawled
            if len(result_select) != 0 and len(detail_list) != c_count:
                continue
            # Otherwise crawl its detail page
            poi_detail = self._get_poi_detail(link)
            # Buffer the scraped detail
            poi_detail['name'] = name
            poi_detail['poi_id'] = poi_id
            detail_list.append(poi_detail)
            logging.info('Detail crawled successfully ' + str(poi_id) + ' ' + name)
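
The post breaks off here: the batch-save step of crawler_detail_worker (using save_count) and the helper methods _get_md5_encrypted_string(), _get_route() and _get_poi_detail() are not included. For orientation only, below is a hypothetical sketch of _get_route(), assuming the request-signing scheme described in public Mafengwo crawler write-ups of the same era: serialize the parameters, append the salt fetched by _get_md5_encrypted_string(), and send a slice of the MD5 digest as _sn. The sAct/iMddid/iPage parameter names, the [2:12] slice and the response shape are all assumptions, not the author's confirmed code.

    def _get_route(self, mdd_id, page=1):
        # Hypothetical reconstruction -- belongs inside MafengwoCrawler.
        params = {
            'sAct': 'KMdd_StructWebAjax|GetPoisByTag',  # assumed endpoint action
            'iMddid': mdd_id,
            'iTagId': 0,
            'iPage': page,
        }
        # Sign the request: md5(sorted querystring + salt), sliced to 10 chars
        query = '&'.join(k + '=' + str(params[k]) for k in sorted(params))
        digest = hashlib.md5((query + self.encrypted_string).encode('utf-8')).hexdigest()
        params['_sn'] = digest[2:12]
        res = self.REQ.get(self.URL_ROUTE, params=params)
        data = json.loads(res.text)['data']
        # The real method presumably also extracts 'pagecount' and the POI list
        # from the returned HTML fragments before crawler() uses them; omitted here.
        return data

With the helpers in place, a typical run would look like this (the log filename is illustrative):

    crawler = MafengwoCrawler()         # or MafengwoCrawler('mafengwo.1580000000.log') to resume
    crawler.crawler_mdd(21536)          # crawl the POI list (21536 = China)
    crawler.crawler_detail()            # then fill the poi_detail table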
