China Weather (weather.com.cn): "you folks really do crawl me every single day."
Libraries used: selenium, lxml (XPath), pymongo (MongoDB)
1. Pics or it didn't happen
(Screenshot: the province/city/county table)
(Screenshot: hourly weather info for one day)
2. Workflow analysis
Click Beijing and its districts appear directly underneath; click Hebei -> Shijiazhuang -> Luquan. So: a municipality (直辖市) has only districts under it, while an ordinary province has three levels: province, city, county.
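A scraped record therefore takes one of two shapes. As a quick sketch, here are the two MongoDB document layouts the code in section 3 produces (the field names match that code; the URLs are placeholders):

# municipality: just the name plus a direct weather link (placeholder URL)
municipality = {'province': '北京', 'link': 'http://www.weather.com.cn/...'}

# ordinary province: three nested levels, province -> citys -> countys
province = {
    'province': '河北',
    'citys': [{
        'city': '石家庄',
        'countys': [
            {'county': '鹿泉', 'link': 'http://www.weather.com.cn/...'}  # placeholder URL
        ]
    }]
}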
I use selenium to simulate a browser and click through the levels one by one. Some may ask why I don't just parse the HTML: the dropdown is rendered by JS, and I never managed to capture the underlying API. It is also a good chance to practice selenium.
Selenium really is that easy. It can handle just about any page, as long as I simulate a human clicking through element by element.
1. Find the input box and click it to open the dropdown
driver.find_element_by_id("txtZip").click()  # click the search box to open the city dropdown
2. Get all the province names on the page
html.xpath("//dd[@id='searchCityList']/a/text()")  # get the list of provinces
3. Simulate clicking a province and check whether it is a municipality: for a municipality, take its link directly; otherwise click the province to list its cities
driver.find_element_by_xpath("//dd[@id='searchCityList']/a[@title='{}']".format(province)).click()  # click the province
4. Click a city to get the county names and links
self.driver.find_element_by_xpath("//dd[@id='cityList_city']/a[@title='{}']".format(city)).click()  # click the city
5. Get the county link and use a regex to check that it really is a URL, because a few hrefs are inline js
# get the link for a given city/county name
def get_link(self, str):
    print("name:", str)
    html = etree.HTML(self.driver.page_source)
    cityStr = "//a[@title='{}']/@href".format(str)
    print("link xpath:", cityStr)
    citylinks = html.xpath(cityStr)
    print("candidate links:", citylinks)
    for link in citylinks:
        isLink = re.search(r'^http://www.\w+.*', link)
        if isLink:  # skip entries whose href is js code rather than a real URL
            return link
6. Store everything in mongodb; then request each stored link and parse the page with xpath. These are the fields we need:
times = html.xpath("//div[@class='time']/em/text()")  # time slots
print(times)
wpics = html.xpath("//div[@class='wpic']/div/big/@title")  # weather condition
tems = html.xpath("//div[@id='curve']/div[@class='tem']/em/text()")  # temperature
print(tems)
winfs = html.xpath("//div[@class='winf']/em/text()")  # wind direction
winls = html.xpath("//div[@class='winl']/em/text()")  # wind force
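For reference, a condensed sketch of how these five parallel lists become one MongoDB document. It uses zip instead of the range(len(times)) indexing the full code in section 3 uses; weatherInfoCollection and name are defined there:

# zip stops at the shortest list, so a short xpath result cannot cause an IndexError
rows = [
    {'times': t, 'wpics': w, 'tems': tem, 'winfs': wf, 'winls': wl}
    for t, w, tem, wf, wl in zip(times, wpics, tems, winfs, winls)
]
weatherInfoCollection.insert_one({'address': name, 'weather': rows})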
3. Code
import asyncio
import re
import time
from lxml import etree
from selenium import webdriver
import pymongo

province_remove_list = ['北京', '上海', '重庆', '天津']  # used to detect municipalities

mongo = pymongo.MongoClient(host='localhost', port=27017)
db = mongo['python']  # get the database
weatherCollection = db.weather  # province/city/county links
linkCollection = db.link  # link collection
weatherInfoCollection = db.weather_info  # detailed weather info
tasks = []


# MongoDB helper class (2)
class MongodbHelp:
    def __init__(self):
        global db, weatherCollection, linkCollection
        self.weatherCollection = weatherCollection  # province/city/county links
        self.linkCollection = linkCollection  # link collection
        self.spider = WeatherSpider()
        self.loop = asyncio.get_event_loop()  # runs the async page requests

    # insert the province/city/county records with their jump links
    def insert_data_weather(self, data):
        result = self.weatherCollection.insert_many(data)
        print(result)

    # insert the detailed weather info for a city
    def insert_data_weather_info(self, data):
        result = weatherInfoCollection.insert_many(data)

    # read the stored records back and parse each weather page
    def select_data_weather(self):
        for x in self.weatherCollection.find():  # every stored link record
            province = x["province"]  # province name
            if province not in province_remove_list:  # ordinary province: walk all three levels
                citys = x["citys"]  # city list
                for city in citys:
                    countys = city["countys"]
                    for county in countys:
                        name = county["county"]  # county name
                        link = county["link"]  # weather page link
                        print(name, link)
                        task = asyncio.ensure_future(self.spider.parse_html(link, name))
                        tasks.append(task)
            else:  # municipality: the link sits directly on the record
                link = x["link"]
                name = province
                print(name, link)
                task = asyncio.ensure_future(self.spider.parse_html(link, name))
                tasks.append(task)
        # semaphore = asyncio.Semaphore(500)
        self.loop.run_until_complete(asyncio.wait(tasks))

    def close(self):
        global mongo
        mongo.close()


# weather page parser class (3)
class WeatherSpider:
    async def parse_html(self, url, name):
        infos = []
        # headless mode: no browser window
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        driver.get(url)  # plain requests stopped returning data once anti-crawling kicked in, so drive a real browser
        html = etree.HTML(driver.page_source)
        times = html.xpath("//div[@class='time']/em/text()")  # time slots
        print(times)
        wpics = html.xpath("//div[@class='wpic']/div/big/@title")  # weather condition
        tems = html.xpath("//div[@id='curve']/div[@class='tem']/em/text()")  # temperature
        print(tems)
        winfs = html.xpath("//div[@class='winf']/em/text()")  # wind direction
        winls = html.xpath("//div[@class='winl']/em/text()")  # wind force
        for i in range(len(times)):
            print(times[i], wpics[i], tems[i], winfs[i], winls[i])
            info = {
                "times": times[i],
                "wpics": wpics[i],
                "tems": tems[i],
                "winfs": winfs[i],
                "winls": winls[i]
            }
            infos.append(info)
        data = {
            "address": name,
            "weather": infos
        }
        global weatherInfoCollection
        weatherInfoCollection.insert_one(data)  # save to the database
        driver.quit()


# browser-driving class (1)
class WeatherSelenium:
    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.get("http://www.weather.com.cn/")
        self.citys = []  # cities
        self.countys = []  # counties
        self.help = MongodbHelp()
        self.datas = []

    def get_province(self):
        self.driver.find_element_by_id("txtZip").click()  # find the search box and click it
        time.sleep(1)
        provinces = self.get_name_list(1)
        for province in provinces:
            time.sleep(2)
            self.get_city(province)  # click through this province
        print("saving data --->", self.datas)
        self.help.insert_data_weather(self.datas)
        self.citys.clear()  # clean up
        self.countys.clear()
        self.datas.clear()
        self.driver.quit()

    def get_name_list(self, isProvince):
        if isProvince:
            html = etree.HTML(self.driver.page_source)  # parse the html
            return html.xpath("//dd[@id='searchCityList']/a/text()")  # province names
        else:
            html = etree.HTML(self.driver.page_source)  # parse the html
            return html.xpath("//dd[@id='cityList_city']/a/text()")  # city/county names

    def get_city(self, province):
        print("province:", province)
        xpathStr = "//dd[@id='searchCityList']/a[@title='{}']"
        if province not in province_remove_list:  # municipality: grab the link; otherwise click into the province
            self.driver.find_element_by_xpath(xpathStr.format(province)).click()  # click the province
            citys = self.get_name_list(0)
            print(province, str(citys))
            for city in citys:  # cities
                self.driver.find_element_by_xpath("//dd[@id='cityList_city']/a[@title='{}']".format(city)).click()  # click the city
                countys = self.get_name_list(0)  # county names
                time.sleep(1)
                for county in countys:
                    link = self.get_link(county)
                    data = {'county': county, 'link': link}
                    print(link)
                    self.countys.append(data)  # collect the counties
                data = {'city': city, 'countys': self.countys}
                self.citys.append(data)
                # rebind with [] instead of clear(): unlike Java, clear() would also empty the list just stored above
                self.countys = []
                time.sleep(1)
                self.driver.find_element_by_xpath("//span[@class='province-back']").click()  # back to the city list
            data = {'province': province, 'citys': self.citys}
            self.datas.append(data)
            self.citys = []  # rebind per province, for the same aliasing reason as countys
            self.driver.find_element_by_xpath("//span[@class='province-back']").click()  # back to the province list
        else:
            link = self.get_link(province)
            data = {'province': province, 'link': link}
            self.datas.append(data)

    # get the link for a given city/county name
    def get_link(self, str):
        print("name:", str)
        html = etree.HTML(self.driver.page_source)
        cityStr = "//a[@title='{}']/@href".format(str)
        print("link xpath:", cityStr)
        citylinks = html.xpath(cityStr)
        print("candidate links:", citylinks)
        for link in citylinks:
            isLink = re.search(r'^http://www.\w+.*', link)
            if isLink:  # skip entries whose href is js code rather than a real URL
                return link


if __name__ == '__main__':
    # scrape the province/city/county table
    selenium = WeatherSelenium()
    selenium.get_province()
    # read the links back and parse each weather page
    help = MongodbHelp()
    help.select_data_weather()
    help.close()
4. Summary
- Learned to drive a real browser with selenium: when a page is built by JS you can brute-force it by clicking like a user. The downside is that every page loads in full, which wastes resources.
- sleep delays are used to wait until a page has loaded before parsing; without them the elements are not found yet. Explicit waits are a cleaner alternative, as shown in the first sketch below.
- list.clear() empties the list in place, so data saved earlier through the same reference is wiped too; rebind with [] instead, as the second sketch below demonstrates.
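On the waiting point: Selenium ships explicit waits that block only until an element is actually usable, which is sturdier than a fixed sleep. A minimal sketch, reusing the txtZip locator from this article, with an arbitrary 10-second timeout:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.weather.com.cn/")
# wait up to 10s for the search box to become clickable, then click it
box = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "txtZip"))
)
box.click()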
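On the clear() point, a tiny self-contained demo of the aliasing involved: clear() mutates the one shared list object, while = [] rebinds the name and leaves the stored reference untouched:

countys = [{'county': '鹿泉'}]
saved = {'countys': countys}   # the dict stores a reference, not a copy
countys.clear()                # mutates the shared list in place
print(saved)                   # {'countys': []} -- the saved data is gone

countys = [{'county': '鹿泉'}]
saved = {'countys': countys}
countys = []                   # rebinds the name; the dict keeps the old list
print(saved)                   # {'countys': [{'county': '鹿泉'}]}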