博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
驴妈妈旅游爬虫
阅读量:5825 次
发布时间:2019-06-18

本文共 12529 字,大约阅读时间需要 41 分钟。

概览页抓取链接

# NOTE(review): HTML-extraction residue of the article's first script (the
# "overview page" link scraper). The blog code-viewer's line numbers (1..~115)
# are fused into the text, and every regex pattern that contained an HTML tag
# was stripped together with the tags — the `re.findall('` calls below end
# mid-pattern, so this block is NOT runnable as-is.
# Still legible from the residue:
#   * class Demo with MySQL settings (host/db/user/passwd, charset utf8mb4),
#     a desktop Chrome User-Agent header, and self.url = lvmama.com homepage;
#   * channel_link: lvmama search URLs, one per destination (URL-encoded
#     Chinese keywords), paired positionally with channel_name;
#   * get_html(url): requests.get + response.apparent_encoding, returns text;
#   * get_data(): iterates channels and re.findall()s detail links — the
#     exact patterns are unrecoverable here; recover them from the original
#     post linked at the bottom of the article before any reuse.
# NOTE(review): entry 10 of channel_link is commented 欧美 ("Europe/America")
# but the matching channel_name entry is '欧洲' ("Europe") — presumably the
# same channel; verify against the original source.
1 import requests  2 import re  3 import pymysql  4 import hashlib  5 import datetime  6   7   8 class Demo(object):  9     def __init__(self): 10         self.host = '127.0.0.1' 11         self.db = 'app_mark' 12         self.user = 'root' 13         self.passwd = '123456' 14         self.charset = 'utf8mb4' 15         self.headers = { 16             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36', 17         } 18         self.url = 'http://www.lvmama.com/' 19         self.channel_link = [ 20             'http://s.lvmama.com/group/H13K110000?keyword=%E6%99%AE%E5%90%89%E5%B2%9B&k=0#list',  # 海岛 21             'http://s.lvmama.com/route/H13K310000?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # 东南亚 22             'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # 中国港澳台 23             'http://s.lvmama.com/group/H13K110000?keyword=%E8%BF%AA%E6%8B%9C&k=0#list',  # 迪拜 24             'http://s.lvmama.com/group/C262H13K110000?keyword=%E4%BF%84%E7%BD%97%E6%96%AF&tabType=group#list',  # 俄罗斯 25             'http://s.lvmama.com/group/H13K110000Y4?keyword=%E8%B6%8A%E5%8D%97#list#list',  # 越南 26             'http://s.lvmama.com/group/C265H13K110000?keyword=%E6%B3%95%E5%9B%BD&tabType=group#list%22',  # 法瑞意德 27             'http://s.lvmama.com/group/H13K110000?keyword=%E5%B7%B4%E5%8E%98%E5%B2%9B&k=0#list',  # 巴厘岛 28             'http://s.lvmama.com/route/H13K310000?keyword=%E6%97%A5%E6%9C%AC&k=0#list',  # 日本 29             'http://s.lvmama.com/route/H13K310000?keyword=%E6%AC%A7%E6%B4%B2&k=0#list',  # 欧美 30             'http://s.lvmama.com/route/H13K440100?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list',  # 新加坡 31             'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list',  # 香港 32             'http://s.lvmama.com/route/H13K310000?keyword=%E6%BE%B3%E6%B4%B2&k=0#list',  # 澳洲 33             
'http://s.lvmama.com/route/H13K310000?keyword=%E6%B3%B0%E5%9B%BD&k=0#list',  # 泰国 34             'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%89%E4%BA%9A&k=0#list',  # 三亚 35             'http://s.lvmama.com/route/H13K440300P2?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # 三亚p2 36             'http://s.lvmama.com/route/H13K440300P3?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # 三亚p3 37             'http://s.lvmama.com/route/H13K440300P4?keyword=%E4%B8%89%E4%BA%9A&tabType=route350',  # 三亚p4 38             'http://s.lvmama.com/route/H13K440300?keyword=%E5%8E%A6%E9%97%A8&k=0#list',  # 厦门 39             'http://s.lvmama.com/route/H13K440300?keyword=%E5%B9%BF%E4%B8%9C&k=0#list',  # 广东 40             'http://s.lvmama.com/route/H13K440300?keyword=%E4%BA%91%E5%8D%97&k=0#list',  # 云南 41             'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%8A%E6%B5%B7&k=0#list',  # 上海 42             'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%AE%89&k=0#list',  # 西安 43             'http://s.lvmama.com/route/H13K440300?keyword=%E6%88%90%E9%83%BD&k=0#list',  # 成都 44             'http://s.lvmama.com/route/H13K440300?keyword=%E5%90%89%E6%9E%97&k=0#list',  # 吉林 45             'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%8C%97&k=0#list',  # 西北 46             'http://s.lvmama.com/scenictour/K110000?keyword=%E5%8C%97%E4%BA%AC&k=0#list',  # 北京 47             'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E4%B8%9C&k=0#list',  # 山东 48             'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E8%A5%BF&k=0#list',  # 山西 49             'http://s.lvmama.com/scenictour/K110000?keyword=%E6%B2%B3%E5%8C%97&k=0#list',  # 河北 50             'http://s.lvmama.com/scenictour/K110000?keyword=%E8%BE%BD%E5%AE%81&k=0#list',  # 辽宁 51             ] 52         self.channel_name = [ 53             '海岛', 54             '东南亚', 55             '中国港澳台', 56             '迪拜', 57             '俄罗斯', 58             '越南', 59             '法瑞意德', 60             
'巴厘岛', 61             '日本', 62             '欧洲', 63             '新加坡', 64             '香港', 65             '澳洲', 66             '泰国', 67             '三亚', 68             '三亚p2', 69             '三亚p3', 70             '三亚p4', 71             '厦门', 72             '广东', 73             '云南', 74             '上海', 75             '西安', 76             '成都', 77             '吉林', 78             '西北', 79             '北京', 80             '山东', 81             '山西', 82             '河北', 83             '辽宁', 84         ] 85  86     def get_html(self, url): 87         response = requests.get(url, headers=self.headers) 88         response.encoding = response.apparent_encoding 89         html = response.text 90         return html 91  92     def get_data(self): 93         # 首页抓取 94         # html = self.get_html(self.url) 95         # datas = re.findall('
  • )', datas, re.S) 97 # for li in lis: 98 # # detail_url = re.findall('
  • .*?', divs, re.S)110 for div in divs:111 print(self.channel_name[index])112 url = re.findall('
  •  

    细览页解析字段

    # NOTE(review): HTML-extraction residue of the article's second script (the
    # "detail page" field parser). As with the first script, the code viewer's
    # line numbers (1..183) are fused into the text and every regex pattern that
    # contained an HTML tag was stripped, so the `re.findall('` calls below end
    # mid-pattern and the block is NOT runnable as-is.
    # Still legible from the residue:
    #   * class XLY with the same MySQL settings and User-Agent as the first
    #     script; self.start timestamps startup for the elapsed-time print;
    #   * get_data(): reads `link` rows from table `gly` where tag="1" and
    #     sitename="驴妈妈旅游", then runs an UPDATE flipping tag "0" -> "1";
    #     NOTE(review): selecting tag="1" while the follow-up UPDATE marks
    #     tag="0" rows as processed looks inverted (new links would be flagged
    #     done without being parsed) — verify against the original post;
    #   * parse_data(url): takes a 1-tuple row, derives `id` from the URL tail
    #     (query string removed), fetches the page, skips 'scenic'/'hotel'
    #     URLs, then regex-extracts title, price (several fallbacks), praise
    #     rate (validated <= 100 after '%' stripping), starting_city,
    #     target_city, days_spent, and type_, and hands the row to save_data;
    #     the exact patterns are unrecoverable from this residue;
    #   * save_data(list_data): parameterized INSERT into table `lvmama`
    #     with rollback on failure, commit on success;
    #   * __main__: fetches links, fans out over multiprocessing.dummy
    #     ThreadPool(20) (threads, not processes), and prints elapsed time.
    # Recover the original patterns from the source post linked at the bottom
    # of the article before any reuse.
    1 import pymysql  2 import re  3 import requests  4 from multiprocessing.dummy import Pool as ThreadPool  5 import datetime  6   7   8 class XLY(object):  9     def __init__(self): 10         self.host = '127.0.0.1' 11         self.db = 'app_mark' 12         self.user = 'root' 13         self.passwd = '123456' 14         self.charset = 'utf8mb4' 15         self.headers = { 16             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36' 17         } 18         self.start = datetime.datetime.now() 19  20     def get_data(self): 21         # 从gly表中拿链接 22         con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset) 23         cur = con.cursor() 24         sql = 'select link from gly where tag = "1" and sitename="驴妈妈旅游"' 25         after_sql = 'update gly set tag="1" where tag="0" and sitename = "驴妈妈旅游"' 26         try: 27             cur.execute(sql) 28             results = cur.fetchall() 29             cur.execute(after_sql) 30         except Exception as e: 31             con.rollback() 32             results = None 33             print('error~', e) 34         else: 35             con.commit() 36         cur.close() 37         con.close() 38         return results 39  40     def parse_data(self, url): 41         # 正则匹配各个字段 42         print(url) 43         url = url[0] 44         # 匹配id 45         id = url.split('/')[-1] 46         id = re.sub('\?.*', '', id) 47         # print(id) 48         response = requests.get(url, headers=self.headers) 49         html = response.text 50         if 'scenic' not in url and 'hotel' not in url: 51             # 去掉酒店和景点 52             # 匹配标题 53             title = re.findall('
    (.*?)', html, re.S) 54 if title: 55 title = title[0] 56 title = re.sub('\n|\r| |自营|<[\s\S]*?>', '', title) 57 title = title.strip() 58 else: 59 title = re.findall('

    (.*?)

    ', html, re.S) 60 if title: 61 title = title[0] 62 title = re.sub('\n|\r| |自营|<[\s\S]*?>', '', title) 63 title = title.strip() 64 # 匹配价格 65 price = re.findall('
    (\d+)', html, re.S) 66 if price: 67 price = price[0] 68 else: 69 price = re.findall('
    .*?(\d+).*?', html, re.S) 70 if price: 71 price = price[0] 72 else: 73 price = re.findall('¥
    (\d+)', html, re.S) 74 if price: 75 price = price[0] 76 else: 77 price = re.findall('
    .*?(\d+).*?', html, re.S) 78 if price: 79 price = price[0] 80 else: 81 price = None 82 # 匹配好评率 83 praise = re.findall('

    [\s\S]*?([\s\S]*?)[\s\S]*?

    ', html, re.S) 84 if praise: 85 praise = praise[0] 86 praise = re.sub('<.*?>', '', praise) 87 praise = praise.strip() 88 else: 89 praise = re.findall('
    ([\s\S]*?)', html, re.S) 90 if praise: 91 praise = praise[0] 92 else: 93 praise = re.findall('([\s\S]*?)', html, re.S) 94 if praise: 95 praise = praise[0] 96 praise = praise.strip() 97 else: 98 praise = re.findall('

    [\s\S]*?([\s\S]*?)%[\s\S]*?', html, re.S) 99 if praise:100 praise = praise[0]101 praise = praise.strip()102 else:103 praise = re.findall('([\s\S]*?)', html, re.S)104 if praise:105 praise = praise[0]106 if praise:107 if '%' in praise:108 praise = re.sub('%', '', praise)109 praise = float(praise)110 if praise > 100:111 praise = None112 print('好评率抓取错误')113 else:114 pass115 else:116 praise = None117 # 匹配出发地118 starting_city = re.findall('

    [\s\S]*?出发城市[\s\S]*?
    ([\s\S]*?)
    ', html, re.S)119 target_city = re.findall('
    目的地[\s\S]*?
    ([\s\S]*?)
    ', html, re.S)120 if starting_city:121 starting_city = starting_city[0]122 starting_city = re.sub('<.*?>', '', starting_city)123 # 匹配目的地124 target_city = target_city[0]125 target_city = re.sub('<.*?>', '', target_city)126 # 匹配天数127 days_spent = re.findall('
    出游天数[\s\S]*?
    ([\s\S]*?)
    ', html, re.S)[0]128 days_spent = re.sub('<.*?>', '', days_spent)129 # print(days_spent)130 else:131 starting_city = target_city = days_spent = None132 # 匹配类型133 type_ = re.findall('
    ([\s\S]*?)', html, re.S)134 if type_:135 type_ = type_[0]136 else:137 type_ = re.findall('
    ([\s\S]*?)', html, re.S)138 if type_:139 type_ = type_[0]140 else:141 type_ = re.findall('
    ([\s\S]*?)', html, re.S)142 if type_:143 type_ = type_[0]144 else:145 type_ = None146 # print(type_)147 list_data = [id, title, price, praise, starting_city, target_city, days_spent, type_, url]148 self.save_data(list_data)149 150 def save_data(self, list_data):151 # 写入数据库152 con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)153 cur = con.cursor()154 sql = 'insert into lvmama(id_num, title, price, praise, starting_city, target_city, days_spent, type_, link) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'155 # cur.execute(sql, list_data)156 # con.commit()157 try:158 cur.execute(sql, list_data)159 print('insert success')160 except Exception as e:161 con.rollback()162 print('error~', e)163 else:164 con.commit()165 cur.close()166 con.close()167 168 169 if __name__ == '__main__':170 xly = XLY()171 urls = xly.get_data()172 if urls:173 # 开启多线程174 pool = ThreadPool(20)175 pool.map(xly.parse_data, urls)176 pool.close()177 pool.join()178 end = datetime.datetime.now()179 print('耗时:', (end-xly.start))180 # for url in urls:181 # url = url[0]182 # xly.parse_data(url)183 # break

     

    转载于:https://www.cnblogs.com/MC-Curry/p/10529578.html

    你可能感兴趣的文章
    开发工具-Xshell工具的下载和安装
    查看>>
    重构项目使用Spring+Hibernate+HibernateAnnotation+GenericDao技术
    查看>>
    WORM Worm worm 毛毛虫爬树爬树~
    查看>>
    Ubuntu中root用户和user用户的相互切换(转)
    查看>>
    圆角Panel
    查看>>
    <知识库的构建> 5-2 通过推理进行信息提取 Information Extraction by reasoning
    查看>>
    定时执行自动化脚本-(一)导入保存jmeter参数至文件的jar包
    查看>>
    javascript中加号的一个小用处
    查看>>
    Linux core 文件 gdb
    查看>>
    微信开发之门店管理{"errcode":40097,"errmsg":"invalid args hint: [xxxxxxx]"}
    查看>>
    vim使用案例
    查看>>
    slice、substring、substr的区别
    查看>>
    5.10心得
    查看>>
    Java8 时间类的运用
    查看>>
    3. 布局控件
    查看>>
    关于ThinkCMF自带插件上传不了图片的解决方法
    查看>>
    第十三天
    查看>>
    Android应用开发提高篇(4)-----Socket编程(多线程、双向通信)
    查看>>
    Linux汇编与C互相调用
    查看>>
    权限菜单
    查看>>