脚本一：概览页抓取链接（从驴妈妈各频道列表页抓取产品链接）
1 import requests 2 import re 3 import pymysql 4 import hashlib 5 import datetime 6 7 8 class Demo(object): 9 def __init__(self): 10 self.host = '127.0.0.1' 11 self.db = 'app_mark' 12 self.user = 'root' 13 self.passwd = '123456' 14 self.charset = 'utf8mb4' 15 self.headers = { 16 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36', 17 } 18 self.url = 'http://www.lvmama.com/' 19 self.channel_link = [ 20 'http://s.lvmama.com/group/H13K110000?keyword=%E6%99%AE%E5%90%89%E5%B2%9B&k=0#list', # 海岛 21 'http://s.lvmama.com/route/H13K310000?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list', # 东南亚 22 'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list', # 中国港澳台 23 'http://s.lvmama.com/group/H13K110000?keyword=%E8%BF%AA%E6%8B%9C&k=0#list', # 迪拜 24 'http://s.lvmama.com/group/C262H13K110000?keyword=%E4%BF%84%E7%BD%97%E6%96%AF&tabType=group#list', # 俄罗斯 25 'http://s.lvmama.com/group/H13K110000Y4?keyword=%E8%B6%8A%E5%8D%97#list#list', # 越南 26 'http://s.lvmama.com/group/C265H13K110000?keyword=%E6%B3%95%E5%9B%BD&tabType=group#list%22', # 法瑞意德 27 'http://s.lvmama.com/group/H13K110000?keyword=%E5%B7%B4%E5%8E%98%E5%B2%9B&k=0#list', # 巴厘岛 28 'http://s.lvmama.com/route/H13K310000?keyword=%E6%97%A5%E6%9C%AC&k=0#list', # 日本 29 'http://s.lvmama.com/route/H13K310000?keyword=%E6%AC%A7%E6%B4%B2&k=0#list', # 欧美 30 'http://s.lvmama.com/route/H13K440100?keyword=%E6%96%B0%E5%8A%A0%E5%9D%A1&k=0#list', # 新加坡 31 'http://s.lvmama.com/route/H13K310000?keyword=%E9%A6%99%E6%B8%AF&k=0#list', # 香港 32 'http://s.lvmama.com/route/H13K310000?keyword=%E6%BE%B3%E6%B4%B2&k=0#list', # 澳洲 33 'http://s.lvmama.com/route/H13K310000?keyword=%E6%B3%B0%E5%9B%BD&k=0#list', # 泰国 34 'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%89%E4%BA%9A&k=0#list', # 三亚 35 'http://s.lvmama.com/route/H13K440300P2?keyword=%E4%B8%89%E4%BA%9A&tabType=route350', # 三亚p2 36 
'http://s.lvmama.com/route/H13K440300P3?keyword=%E4%B8%89%E4%BA%9A&tabType=route350', # 三亚p3 37 'http://s.lvmama.com/route/H13K440300P4?keyword=%E4%B8%89%E4%BA%9A&tabType=route350', # 三亚p4 38 'http://s.lvmama.com/route/H13K440300?keyword=%E5%8E%A6%E9%97%A8&k=0#list', # 厦门 39 'http://s.lvmama.com/route/H13K440300?keyword=%E5%B9%BF%E4%B8%9C&k=0#list', # 广东 40 'http://s.lvmama.com/route/H13K440300?keyword=%E4%BA%91%E5%8D%97&k=0#list', # 云南 41 'http://s.lvmama.com/route/H13K440300?keyword=%E4%B8%8A%E6%B5%B7&k=0#list', # 上海 42 'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%AE%89&k=0#list', # 西安 43 'http://s.lvmama.com/route/H13K440300?keyword=%E6%88%90%E9%83%BD&k=0#list', # 成都 44 'http://s.lvmama.com/route/H13K440300?keyword=%E5%90%89%E6%9E%97&k=0#list', # 吉林 45 'http://s.lvmama.com/route/H13K440300?keyword=%E8%A5%BF%E5%8C%97&k=0#list', # 西北 46 'http://s.lvmama.com/scenictour/K110000?keyword=%E5%8C%97%E4%BA%AC&k=0#list', # 北京 47 'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E4%B8%9C&k=0#list', # 山东 48 'http://s.lvmama.com/scenictour/K110000?keyword=%E5%B1%B1%E8%A5%BF&k=0#list', # 山西 49 'http://s.lvmama.com/scenictour/K110000?keyword=%E6%B2%B3%E5%8C%97&k=0#list', # 河北 50 'http://s.lvmama.com/scenictour/K110000?keyword=%E8%BE%BD%E5%AE%81&k=0#list', # 辽宁 51 ] 52 self.channel_name = [ 53 '海岛', 54 '东南亚', 55 '中国港澳台', 56 '迪拜', 57 '俄罗斯', 58 '越南', 59 '法瑞意德', 60 '巴厘岛', 61 '日本', 62 '欧洲', 63 '新加坡', 64 '香港', 65 '澳洲', 66 '泰国', 67 '三亚', 68 '三亚p2', 69 '三亚p3', 70 '三亚p4', 71 '厦门', 72 '广东', 73 '云南', 74 '上海', 75 '西安', 76 '成都', 77 '吉林', 78 '西北', 79 '北京', 80 '山东', 81 '山西', 82 '河北', 83 '辽宁', 84 ] 85 86 def get_html(self, url): 87 response = requests.get(url, headers=self.headers) 88 response.encoding = response.apparent_encoding 89 html = response.text 90 return html 91 92 def get_data(self): 93 # 首页抓取 94 # html = self.get_html(self.url) 95 # datas = re.findall('
脚本二：细览页解析字段（读取 gly 表中的链接，解析详情页字段并写入 lvmama 表）
# NOTE(review): this chunk is a complete, self-contained script (the
# "detail-page field parser") pasted after the overview-page crawler above.
# The HTML-tag portions of most regex patterns below were destroyed when the
# file was extracted (tags were swallowed as markup), so every pattern that
# looks incomplete is reproduced exactly as found -- they must be restored
# from the live page markup before this scrapes anything useful.
import datetime
import re
from multiprocessing.dummy import Pool as ThreadPool

import pymysql
import requests


class XLY(object):
    """Crawl lvmama product detail pages and persist the parsed fields.

    Links are read from the ``gly`` table, each detail page is fetched and
    scraped with regexes, and the resulting rows are inserted into the
    ``lvmama`` table.
    """

    def __init__(self):
        # MySQL connection settings, shared by get_data()/save_data().
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        # Desktop Chrome UA so the site serves the normal desktop page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
        }
        self.start = datetime.datetime.now()  # wall-clock start, reported by __main__

    def get_data(self):
        """Fetch link rows to crawl from ``gly``; return a tuple of 1-tuples,
        or None if the query failed.

        NOTE(review): the SELECT reads rows with tag="1" while the follow-up
        UPDATE promotes tag "0" -> "1", i.e. links inserted by the overview
        crawler are only picked up on the *next* run.  This looks like a
        deliberate two-phase hand-off -- confirm before changing the SQL.
        """
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'select link from gly where tag = "1" and sitename="驴妈妈旅游"'
        after_sql = 'update gly set tag="1" where tag="0" and sitename = "驴妈妈旅游"'
        results = None
        try:
            cur.execute(sql)
            results = cur.fetchall()
            cur.execute(after_sql)
        except Exception as e:
            con.rollback()
            results = None
            print('error~', e)
        else:
            con.commit()
        finally:
            # fix: close even if commit/rollback raises (original could leak here)
            cur.close()
            con.close()
        return results

    def parse_data(self, url):
        """Fetch one detail page, extract all fields and insert them.

        :param url: a 1-tuple row from :meth:`get_data` holding the link.
        Hotel and scenic-spot pages use different templates and are skipped.
        """
        print(url)
        url = url[0]
        # Product id: last path segment with any query string stripped.
        # (fix: renamed from `id`, which shadowed the builtin)
        item_id = re.sub(r'\?.*', '', url.split('/')[-1])
        response = requests.get(url, headers=self.headers)
        html = response.text
        if 'scenic' in url or 'hotel' in url:
            return  # skip hotels and scenic spots
        title = self._match_title(html)
        price = self._match_price(html)
        praise = self._match_praise(html)
        starting_city, target_city, days_spent = self._match_trip(html)
        type_ = self._match_type(html)
        self.save_data([item_id, title, price, praise, starting_city,
                        target_city, days_spent, type_, url])

    # --- field extractors -------------------------------------------------
    # NOTE(review): every pattern below lost its HTML-tag parts during
    # extraction and is kept as found; restore from the real page source.

    def _match_title(self, html):
        """Cleaned product title, or None when neither layout matches."""
        found = (re.findall(r'(.*?)', html, re.S)
                 or re.findall(r' (.*?)', html, re.S))
        if not found:
            # fix: the original left `title` as an empty list here and
            # passed that list straight into the INSERT.
            return None
        cleaned = re.sub(r'\n|\r| |自营|<[\s\S]*?>', '', found[0])
        return cleaned.strip()

    def _match_price(self, html):
        """First price figure matched by any known layout, or None."""
        patterns = (
            r'(\d+)',
            r' .*?(\d+).*?',
            r'¥ (\d+)',
            r' .*?(\d+).*?',
        )
        for pattern in patterns:
            found = re.findall(pattern, html, re.S)
            if found:
                return found[0]
        return None

    def _match_praise(self, html):
        """Praise rate as a float in [0, 100], or None when absent/implausible.

        Tag-stripping and whitespace cleanup are applied uniformly to every
        layout (the original only cleaned some branches).
        """
        patterns = (
            r' [\s\S]*?([\s\S]*?)[\s\S]*?',
            r' ([\s\S]*?)',
            r'([\s\S]*?)',
            r'[\s\S]*?([\s\S]*?)%[\s\S]*?',
            r'([\s\S]*?)',
        )
        raw = None
        for pattern in patterns:
            found = re.findall(pattern, html, re.S)
            if found:
                raw = re.sub(r'<.*?>', '', found[0]).strip()
                break
        if not raw:
            return None
        praise = float(raw.replace('%', ''))
        if praise > 100:
            print('好评率抓取错误')  # implausible value -> treat as missing
            return None
        return praise

    def _match_trip(self, html):
        """Return (starting_city, target_city, days_spent); all None if absent."""
        starting = re.findall(r'[\s\S]*?出发城市[\s\S]*?([\s\S]*?) ', html, re.S)
        target = re.findall(r'目的地[\s\S]*?([\s\S]*?)', html, re.S)
        days = re.findall(r'出游天数[\s\S]*?([\s\S]*?)', html, re.S)
        if not starting:
            return None, None, None
        starting_city = re.sub(r'<.*?>', '', starting[0])
        # fix: the original indexed target[0] / findall(...)[0] for the days
        # unguarded, raising IndexError whenever only the departure matched.
        target_city = re.sub(r'<.*?>', '', target[0]) if target else None
        days_spent = re.sub(r'<.*?>', '', days[0]) if days else None
        return starting_city, target_city, days_spent

    def _match_type(self, html):
        """Product type label from one of three page layouts, or None."""
        patterns = (
            r' ([\s\S]*?)',
            r' ([\s\S]*?)',
            r' ([\s\S]*?)',
        )
        for pattern in patterns:
            found = re.findall(pattern, html, re.S)
            if found:
                return found[0]
        return None

    def save_data(self, list_data):
        """Insert one parsed row into ``lvmama``; roll back and log on failure.

        Each worker thread opens its own connection, so concurrent inserts
        from the thread pool do not share cursors.
        """
        con = pymysql.connect(host=self.host, db=self.db, user=self.user,
                              passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into lvmama(id_num, title, price, praise, starting_city, target_city, days_spent, type_, link) values (%s, %s, %s, %s, %s, %s, %s, %s, %s)'
        try:
            cur.execute(sql, list_data)
            print('insert success')
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
        finally:
            cur.close()
            con.close()


if __name__ == '__main__':
    xly = XLY()
    urls = xly.get_data()
    if urls:
        # 20 worker threads: the work is network/DB bound, so threads overlap I/O.
        pool = ThreadPool(20)
        pool.map(xly.parse_data, urls)
        pool.close()
        pool.join()
        end = datetime.datetime.now()
        print('耗时:', (end - xly.start))