China Weather (weather.com.cn): "you folks really do crawl me every single day."
Libraries used: selenium, lxml (XPath), pymongo (MongoDB)
1. Pics or it didn't happen
(Screenshot: the province/city/county table)
(Screenshot: hourly weather info for one day)
2. Workflow analysis
Click Beijing and its districts appear directly underneath; click Hebei -> Shijiazhuang -> Luquan. So: a municipality (直辖市) has only districts under it, while an ordinary province has three levels: province, city, county.
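A scraped record therefore takes one of two shapes. As a quick sketch, here are the two MongoDB document layouts the code in section 3 produces (the field names match that code; the URLs are placeholders):

# municipality: just the name plus a direct weather link (placeholder URL)
municipality = {'province': '北京', 'link': 'http://www.weather.com.cn/...'}

# ordinary province: three nested levels, province -> citys -> countys
province = {
    'province': '河北',
    'citys': [{
        'city': '石家庄',
        'countys': [
            {'county': '鹿泉', 'link': 'http://www.weather.com.cn/...'}  # placeholder URL
        ]
    }]
}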
I use selenium to simulate a browser and click through the levels one by one. Some may ask why I don't just parse the HTML: the dropdown is rendered by JS, and I never managed to capture the underlying API. It is also a good chance to practice selenium.
Selenium really is that easy. It can handle just about any page, as long as I simulate a human clicking through element by element.
1. Find the input box and click it to open the dropdown
driver.find_element_by_id("txtZip").click()  # click the search box to open the city dropdown
2. Get all the province names on the page
html.xpath("//dd[@id='searchCityList']/a/text()")  # get the list of provinces
3. Simulate clicking a province and check whether it is a municipality: for a municipality, take its link directly; otherwise click the province to list its cities
driver.find_element_by_xpath("//dd[@id='searchCityList']/a[@title='{}']".format(province)).click()  # click the province
4. Click a city to get the county names and links
self.driver.find_element_by_xpath("//dd[@id='cityList_city']/a[@title='{}']".format(city)).click()  # click the city
5. Get the county link and use a regex to check that it really is a URL, because a few hrefs are inline js
# get the link for a given city/county name
def get_link(self, str):
    print("name:", str)
    html = etree.HTML(self.driver.page_source)
    cityStr = "//a[@title='{}']/@href".format(str)
    print("link xpath:", cityStr)
    citylinks = html.xpath(cityStr)
    print("candidate links:", citylinks)
    for link in citylinks:
        isLink = re.search(r'^http://www.\w+.*', link)
        if isLink:  # skip entries whose href is js code rather than a real URL
            return link
6. Store everything in mongodb; then request each stored link and parse the page with xpath. These are the fields we need:
times = html.xpath("//div[@class='time']/em/text()")  # time slots
print(times)
wpics = html.xpath("//div[@class='wpic']/div/big/@title")  # weather condition
tems = html.xpath("//div[@id='curve']/div[@class='tem']/em/text()")  # temperature
print(tems)
winfs = html.xpath("//div[@class='winf']/em/text()")  # wind direction
winls = html.xpath("//div[@class='winl']/em/text()")  # wind force
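For reference, a condensed sketch of how these five parallel lists become one MongoDB document. It uses zip instead of the range(len(times)) indexing the full code in section 3 uses; weatherInfoCollection and name are defined there:

# zip stops at the shortest list, so a short xpath result cannot cause an IndexError
rows = [
    {'times': t, 'wpics': w, 'tems': tem, 'winfs': wf, 'winls': wl}
    for t, w, tem, wf, wl in zip(times, wpics, tems, winfs, winls)
]
weatherInfoCollection.insert_one({'address': name, 'weather': rows})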
3. Code
import asyncio
import re
import time
from lxml import etree
from selenium import webdriver
import pymongo

province_remove_list = ['北京', '上海', '重庆', '天津']  # used to detect municipalities

mongo = pymongo.MongoClient(host='localhost', port=27017)
db = mongo['python']  # get the database
weatherCollection = db.weather  # province/city/county links
linkCollection = db.link  # link collection
weatherInfoCollection = db.weather_info  # detailed weather info
tasks = []


# MongoDB helper class (2)
class MongodbHelp:
    def __init__(self):
        global db, weatherCollection, linkCollection
        self.weatherCollection = weatherCollection  # province/city/county links
        self.linkCollection = linkCollection  # link collection
        self.spider = WeatherSpider()
        self.loop = asyncio.get_event_loop()  # runs the async page requests

    # insert the province/city/county records with their jump links
    def insert_data_weather(self, data):
        result = self.weatherCollection.insert_many(data)
        print(result)

    # insert the detailed weather info for a city
    def insert_data_weather_info(self, data):
        result = weatherInfoCollection.insert_many(data)

    # read the stored records back and parse each weather page
    def select_data_weather(self):
        for x in self.weatherCollection.find():  # every stored link record
            province = x["province"]  # province name
            if province not in province_remove_list:  # ordinary province: walk all three levels
                citys = x["citys"]  # city list
                for city in citys:
                    countys = city["countys"]
                    for county in countys:
                        name = county["county"]  # county name
                        link = county["link"]  # weather page link
                        print(name, link)
                        task = asyncio.ensure_future(self.spider.parse_html(link, name))
                        tasks.append(task)
            else:  # municipality: the link sits directly on the record
                link = x["link"]
                name = province
                print(name, link)
                task = asyncio.ensure_future(self.spider.parse_html(link, name))
                tasks.append(task)
        # semaphore = asyncio.Semaphore(500)
        self.loop.run_until_complete(asyncio.wait(tasks))

    def close(self):
        global mongo
        mongo.close()


# weather page parser class (3)
class WeatherSpider:
    async def parse_html(self, url, name):
        infos = []
        # headless mode: no browser window
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        driver.get(url)  # plain requests stopped returning data once anti-crawling kicked in, so drive a real browser
        html = etree.HTML(driver.page_source)
        times = html.xpath("//div[@class='time']/em/text()")  # time slots
        print(times)
        wpics = html.xpath("//div[@class='wpic']/div/big/@title")  # weather condition
        tems = html.xpath("//div[@id='curve']/div[@class='tem']/em/text()")  # temperature
        print(tems)
        winfs = html.xpath("//div[@class='winf']/em/text()")  # wind direction
        winls = html.xpath("//div[@class='winl']/em/text()")  # wind force
        for i in range(len(times)):
            print(times[i], wpics[i], tems[i], winfs[i], winls[i])
            info = {
                "times": times[i],
                "wpics": wpics[i],
                "tems": tems[i],
                "winfs": winfs[i],
                "winls": winls[i]
            }
            infos.append(info)
        data = {
            "address": name,
            "weather": infos
        }
        global weatherInfoCollection
        weatherInfoCollection.insert_one(data)  # save to the database
        driver.quit()


# browser-driving class (1)
class WeatherSelenium:
    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.get("http://www.weather.com.cn/")
        self.citys = []  # cities
        self.countys = []  # counties
        self.help = MongodbHelp()
        self.datas = []

    def get_province(self):
        self.driver.find_element_by_id("txtZip").click()  # find the search box and click it
        time.sleep(1)
        provinces = self.get_name_list(1)
        for province in provinces:
            time.sleep(2)
            self.get_city(province)  # click through this province
        print("saving data --->", self.datas)
        self.help.insert_data_weather(self.datas)
        self.citys.clear()  # clean up
        self.countys.clear()
        self.datas.clear()
        self.driver.quit()

    def get_name_list(self, isProvince):
        if isProvince:
            html = etree.HTML(self.driver.page_source)  # parse the html
            return html.xpath("//dd[@id='searchCityList']/a/text()")  # province names
        else:
            html = etree.HTML(self.driver.page_source)  # parse the html
            return html.xpath("//dd[@id='cityList_city']/a/text()")  # city/county names

    def get_city(self, province):
        print("province:", province)
        xpathStr = "//dd[@id='searchCityList']/a[@title='{}']"
        if province not in province_remove_list:  # municipality: grab the link; otherwise click into the province
            self.driver.find_element_by_xpath(xpathStr.format(province)).click()  # click the province
            citys = self.get_name_list(0)
            print(province, str(citys))
            for city in citys:  # cities
                self.driver.find_element_by_xpath("//dd[@id='cityList_city']/a[@title='{}']".format(city)).click()  # click the city
                countys = self.get_name_list(0)  # county names
                time.sleep(1)
                for county in countys:
                    link = self.get_link(county)
                    data = {'county': county, 'link': link}
                    print(link)
                    self.countys.append(data)  # collect the counties
                data = {'city': city, 'countys': self.countys}
                self.citys.append(data)
                # rebind with [] instead of clear(): unlike Java, clear() would also empty the list just stored above
                self.countys = []
                time.sleep(1)
                self.driver.find_element_by_xpath("//span[@class='province-back']").click()  # back to the city list
            data = {'province': province, 'citys': self.citys}
            self.datas.append(data)
            self.citys = []  # rebind per province, for the same aliasing reason as countys
            self.driver.find_element_by_xpath("//span[@class='province-back']").click()  # back to the province list
        else:
            link = self.get_link(province)
            data = {'province': province, 'link': link}
            self.datas.append(data)

    # get the link for a given city/county name
    def get_link(self, str):
        print("name:", str)
        html = etree.HTML(self.driver.page_source)
        cityStr = "//a[@title='{}']/@href".format(str)
        print("link xpath:", cityStr)
        citylinks = html.xpath(cityStr)
        print("candidate links:", citylinks)
        for link in citylinks:
            isLink = re.search(r'^http://www.\w+.*', link)
            if isLink:  # skip entries whose href is js code rather than a real URL
                return link


if __name__ == '__main__':
    # scrape the province/city/county table
    selenium = WeatherSelenium()
    selenium.get_province()
    # read the links back and parse each weather page
    help = MongodbHelp()
    help.select_data_weather()
    help.close()
4. Summary
- Learned to drive a real browser with selenium: when a page is built by JS you can brute-force it by clicking like a user. The downside is that every page loads in full, which wastes resources.
- sleep delays are used to wait until a page has loaded before parsing; without them the elements are not found yet. Explicit waits are a cleaner alternative, as shown in the first sketch below.
- list.clear() empties the list in place, so data saved earlier through the same reference is wiped too; rebind with [] instead, as the second sketch below demonstrates.
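On the waiting point: Selenium ships explicit waits that block only until an element is actually usable, which is sturdier than a fixed sleep. A minimal sketch, reusing the txtZip locator from this article, with an arbitrary 10-second timeout:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.weather.com.cn/")
# wait up to 10s for the search box to become clickable, then click it
box = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "txtZip"))
)
box.click()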
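On the clear() point, a tiny self-contained demo of the aliasing involved: clear() mutates the one shared list object, while = [] rebinds the name and leaves the stored reference untouched:

countys = [{'county': '鹿泉'}]
saved = {'countys': countys}   # the dict stores a reference, not a copy
countys.clear()                # mutates the shared list in place
print(saved)                   # {'countys': []} -- the saved data is gone

countys = [{'county': '鹿泉'}]
saved = {'countys': countys}
countys = []                   # rebinds the name; the dict keeps the old list
print(saved)                   # {'countys': [{'county': '鹿泉'}]}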