当前位置: 首页 > news >正文

丽水网站建设seo/廊坊关键词优化排名

丽水网站建设seo,廊坊关键词优化排名,邢台做网站多少钱,网站建设丿金手指谷哥14 通过对phpcms数据库字段的填充,实现自动发布文章;手动发布一篇文章并查看数据库中哪些table发生变化,即可发现cms(如帝国cms等)文章自动化发布工具开发的突破口! Python # coding=utf-8 功能:采集…

通过填充phpcms数据库的相关字段,可以实现文章的自动发布。先手动发布一篇文章,再查看数据库中哪些table发生了变化,即可找到开发cms(如帝国cms等)文章自动化发布工具的突破口!

Python
# coding=utf-8 '''功能:采集百度新闻(http://news.baidu.com/)内容,百度新闻聚合了许多行业网站的新闻,已经帮我们去重筛选了,采集自己行业的新闻数据很不错。 主要思路:1,利用字典把各个网站的网址与正则及网页编码对应起来 2,把采集过得url放到一个文件中,判断是否采集过 3,百度新闻5分钟跟新一次,可以再建个程序每隔几分钟运行一次 ''' import pycurl,StringIO,json,urllib,urllib2,re import MySQLdb import time from warnings import filterwarnings import MySQLdb as Database filterwarnings('ignore', category = Database.Warning) import sys reload(sys) sys.setdefaultencoding('utf8') headers = [ "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36", "Cookie: spversion=20130314; historystock=603158%7C*%7C1A0001%7C*%7C000967%7C*%7C603328; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1467682875,1467682943,1467682974,1468293176; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1468293226", ] def curl(url): c = pycurl.Curl() #通过curl方法构造一个对象 #c.setopt(pycurl.REFERER, 'http://qy.m.58.com/') #设置referer c.setopt(pycurl.FOLLOWLOCATION, True) #自动进行跳转抓取 c.setopt(pycurl.MAXREDIRS,5) #设置最多跳转多少次 c.setopt(pycurl.CONNECTTIMEOUT, 60) #设置链接超时 c.setopt(pycurl.TIMEOUT,120) #下载超时 c.setopt(pycurl.ENCODING, 'gzip,deflate') # c.setopt(c.PROXY,ip) # 代理 c.fp = StringIO.StringIO() c.setopt(pycurl.URL, url) #设置要访问的URL c.setopt(pycurl.HTTPHEADER,headers) #传入请求头 # c.setopt(pycurl.POST, 1) # c.setopt(pycurl.POSTFIELDS, data) #传入POST数据 c.setopt(c.WRITEFUNCTION, c.fp.write) #回调写入字符串缓存 c.perform() code = c.getinfo(c.HTTP_CODE) #返回状态码 html = c.fp.getvalue() #返回源代码 return html # 通过正则提取元素 def search(req,html): text = re.search(req,html) if text: data = text.group(1) else: data = 'no' return data # 去除文章url、多余标签等、补全路径等 def content_sort(content): content = re.sub('<p.*?>','<p>',content,flags=re.I) content = re.sub('<P.*?>','<p>',content) content = re.sub('</?span.*?>','',content) content = re.sub('</?a.*?>','',content) content = re.sub('<!.*?>','',content) content = re.sub('</?img.*?>','',content,re.IGNORECASE) content = re.sub('</?IMG.*?>','',content,re.IGNORECASE) content = 
re.sub('</?div.*?>','',content,flags=re.I) content = re.sub('</?DIV.*?>','',content) content = re.sub('</?iframe.*?>','',content) content = re.sub('</?center.*?>','',content) content = re.sub('</?[fF].*?>','',content) content = re.sub('<script.*?>[\s\S]*?</script>','',content) content = re.sub('</?strong.*?>','',content) content = re.sub('<INPUT.*?>','',content) content = re.sub('<style.*?>[\s\S]*?</style>','',content) content = re.sub(' ','',content) content = re.sub(' ','',content) content = re.sub(' ','',content) return content #域名与正则、编码对应表 req_dict = { 'finance.sina.com.cn': {'title':'<h1.*?>(.*?)</h1>','content':'<!-- 原始正文start -->([\s\S]*?)<!-- 原始正文end -->','decode':'utf-8'}, 'stock.eastmoney.com': {'title':'<h1.*?>(.*?)</h1>','content':'<div id="ContentBody" class="Body">([\s\S]*?)<div class="BodyEnd">','decode':'gbk'}, 'finance.eastmoney.com': {'title':'<h1.*?>(.*?)</h1>','content':'<div id="ContentBody" class="Body">([\s\S]*?)<div class="BodyEnd">','decode':'gbk'},#ok 'guba.eastmoney.com': {'title':'<title>(.*?)_.*?</title>','content':'<div id="zwconbody">([\s\S]*?)<div class="zwconbtns clearfix">','decode':'utf-8'},#ok 'stock.jrj.com.cn': {'title':'<title>(.*?)-','content':'<div class="texttit_m1">([\s\S]*?)<div id="itougu">','decode':'gbk'}, 'hk.jrj.com.cn': {'title':'<title>(.*?)-','content':'<div class="texttit_m1">([\s\S]*?)<div id="itougu">','decode':'gbk'}, 'hkstock.cnfol.com': {'title':'<title>(.*?)_.*?</title>','content':'<div class="ArtM" id="Content">([\s\S]*?)<!--正文结束-->','decode':'utf-8'},#ok 'sc.stock.cnfol.com': {'title':'<title>(.*?)_.*?</title>','content':'<div class="ArtM" id="Content">([\s\S]*?)<!--正文结束-->','decode':'utf-8'},#ok 'money.163.com': {'title':'<title>(.*?)_.*?</title>','content':'<div class="post_text".*?">([\s\S]*?)<!--.*?s -->','decode':'utf-8'}, 'www.chinastock.com.cn': {'title':'<div class="d_title">([\s\S]*?)</div>','content':'<div class="d_content" id="Zoom">([\s\S]*?)<div 
class="dleft_new_attachment">','decode':'utf-8'}, 'stock.huagu.com': {'title':'<h1 id="h1-title">([\s\S]*?)</h1>','content':'<div class="article_con" id="div-article-content">([\s\S]*?)<div class="clear"></div>','decode':'utf-8'}, 'stock.sohu.com': {'title':'<h1 itemprop="headline">([\s\S]*?)</h1>','content':'<div itemprop="articleBody">([\s\S]*?)<div class="original-title"','decode':'gbk'}, 'stock.cngold.org': {'title':'<title>(.*?)-.*?</title>','content':'<div class="det_content" id="zoom">([\s\S]*?)<div class="listPage">','decode':'utf-8'}, 'hk.stock.hexun.com': {'title':'<title>(.*?)[-_|].*?</title>','content':'<div class="art_contextBox">([\s\S]*?)<div class="showAll">','decode':'utf-8'}, 'stock.gucheng.com': {'title':'<title>(.*?)[-_|].*?</title>','content':'<div class="content">([\s\S]*?)</div>','decode':'utf-8'}, 'www.cnstock.com': {'title':'<title>(.*?)-.*?</title>','content':'<div class="content-inner" id="qmt_content_div">([\s\S]*?)</div>','decode':'gbk'}, 'www.ccstock.cn': {'title':'<title>(.*?)-.*?</title>','content':'<div id="newscontent">([\s\S]*?)</div>','decode':'utf-8'}, 'news.emoney.cn': {'title':'<title>(.*?)-.*?</title>','content':'<div class="RL_details_content">([\s\S]*?)<div class="PageNav">','decode':'utf-8'}, 'finance.ce.cn': {'title':'<title>(.*?)</title>','content':'<div class=TRS_Editor>([\s\S]*?)<textarea id="allinfo"','decode':'gbk'}, 'www.p5w.net': {'title':'<title>(.*?)[_-|].*?</title>','content':'<div class="text">([\s\S]*?)<div class="pages">','decode':'gbk'}, 'www.nbd.com.cn': {'title':'<title>(.*?)[_-|][\s\S]*?</title>','content':'<div class="main-left-article">([\s\S]*?)<div style="overflow:','decode':'utf-8'}, 'stock.hexun.com': {'title':'<title>(.*?)[-_|].*?</title>','content':'<div class="art_contextBox">([\s\S]*?)<div class="showAll">','decode':'gbk'}, 'stock.caijing.com.cn': {'title':'<title>(.*?)[-_|].*?</title>','content':'<div id="the_content".*?>([\s\S]*?)<div class="ar_writer"','decode':'utf-8'}, } def id(): 
'''获取标题对应id,构建url.我用的是phpcms,前台显示需将url写入数据库''' con = MySQLdb.connect('localhost','root','','phpcmsv9',charset='utf8') with con: cur = con.cursor() cur.execute("select id from v9_news where title = title") numrows = int(cur.rowcount) return numrows+1 def CmsSQL(title,content): '''写入数据,如何将多个数据写入数据库可参考''' value1 = [] value1.append(content) value1.append(idnum) value2 = [] value2.append(title) value2.append(urlid) value2.append(int(time.time())) value2.append(int(time.time())) db = MySQLdb.connect('localhost','root','','phpcmsv9',charset='utf8') cursor = db.cursor() cursor.execute("insert into v9_news_data (content,id) values(%s,%s)" ,value1) cursor.execute("insert into v9_news(title,catid,typeid,url,inputtime,updatetime) values(%s,6,0,%s,%s,%s)",value2) db.commit() db.close() url = 'http://news.baidu.com/n?cmd=4&class=gegu&tn=rss' urls = re.findall(r'<link><!\[CDATA\[(.*?)\]\]></link>',curl(url)) urls.reverse() for url in urls: with open('urls.txt') as f1 : if url not in f1.read(): #判断url是否采集过 url.strip() f1.close() line = url.split('/')[2] if req_dict.has_key(line): #通过键位是否存在判断这个网站是否写好的正则 time.sleep(1) try: title = search(req_dict[line]['title'],curl(url)).decode(req_dict[line]['decode']) #网址与正则及网页编码对应起来 content = url + search(req_dict[line]['content'],curl(url)).decode(req_dict[line]['decode']) except: continue urlid = 'http://localhost/index.php?m=content&c=index&a=show&catid=6&id=%s' %id() idnum = int(id()) print id(),content_sort(title) CmsSQL(content_sort(title),content_sort(content)) f1w =open('urls.txt','a+') f1w.write(url+'\n') f1w.close() else: print u'正则不存在' open('requrl','a+').write(url+'\n') else: print u'此url在列表中:'
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# coding=utf-8
"""Collect articles from Baidu News (http://news.baidu.com/) and publish them.

Baidu News aggregates news from many industry sites and has already
de-duplicated and filtered it, which makes it a convenient source for
collecting news about one's own industry.

Approach:
1. A dict maps each site's host name to its regexes and page encoding.
2. URLs that were already collected are kept in a file and checked before
   collecting again.
3. Baidu News refreshes about every 5 minutes, so a separate program can run
   this script every few minutes.
"""
import json
import re
import StringIO
import sys
import time
import urllib
import urllib2
from warnings import filterwarnings

import MySQLdb
import MySQLdb as Database
import pycurl

# Silence warnings raised by the MySQL driver.
filterwarnings('ignore', category=Database.Warning)

# Python 2 idiom: make UTF-8 the default codec for implicit str/unicode
# conversions. reload(sys) is what makes setdefaultencoding callable here.
reload(sys)
sys.setdefaultencoding('utf8')
# Request headers sent with every fetch; curl() passes this list to
# pycurl.HTTPHEADER.
# NOTE(review): the Cookie value looks like a captured browser session --
# confirm it is still required before relying on it.
headers=[
"User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
"Cookie: spversion=20130314; historystock=603158%7C*%7C1A0001%7C*%7C000967%7C*%7C603328; Hm_lvt_78c58f01938e4d85eaf619eae71b4ed1=1467682875,1467682943,1467682974,1468293176; Hm_lpvt_78c58f01938e4d85eaf619eae71b4ed1=1468293226",
]
def curl(url):
    """Download *url* with pycurl and return the raw response body.

    Redirects are followed (at most 5), gzip/deflate responses are accepted
    and the module-level ``headers`` list is sent with the request.

    Args:
        url: Address to fetch.

    Returns:
        The undecoded response body collected from the write callback.

    Raises:
        pycurl.error: Propagated from ``perform()`` when the transfer fails.
    """
    body = StringIO.StringIO()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.FOLLOWLOCATION, True)      # follow redirects
        c.setopt(pycurl.MAXREDIRS, 5)              # redirect limit
        c.setopt(pycurl.CONNECTTIMEOUT, 60)        # connect timeout
        c.setopt(pycurl.TIMEOUT, 120)              # whole-transfer timeout
        c.setopt(pycurl.ENCODING, 'gzip,deflate')
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.HTTPHEADER, headers)
        # Optional settings the original left disabled: REFERER, PROXY and
        # POST/POSTFIELDS.
        c.setopt(pycurl.WRITEFUNCTION, body.write)  # collect body in memory
        c.perform()
    finally:
        # Always release the handle, even when perform() raises.
        c.close()
    return body.getvalue()
def search(req, html):
    """Extract an element from *html* with a regular expression.

    Args:
        req: Regular expression with at least one capture group.
        html: Text to scan.

    Returns:
        Group 1 of the first match, or the sentinel string ``'no'`` when the
        pattern does not match.
    """
    match = re.search(req, html)
    if match:
        return match.group(1)
    return 'no'
# Ordered clean-up rules used by content_sort(): (compiled pattern, replacement).
# Every rule is case-insensitive because HTML tag names are. The patterns are
# prefix matches, so e.g. '<p' also rewrites '<pre ...>' to '<p>'.
_CLEANUP_RULES = tuple(
    (re.compile(pattern, re.I), replacement)
    for pattern, replacement in (
        (r'<p.*?>', '<p>'),                       # drop attributes on <p>
        (r'</?span.*?>', ''),
        (r'</?a.*?>', ''),
        (r'<!.*?>', ''),                          # single-line comments/doctype
        (r'</?img.*?>', ''),
        (r'</?div.*?>', ''),
        (r'</?iframe.*?>', ''),
        (r'</?center.*?>', ''),
        (r'</?[fF].*?>', ''),                     # any tag starting with "f"
        (r'<script.*?>[\s\S]*?</script>', ''),    # tag and its body
        (r'</?strong.*?>', ''),
        (r'<INPUT.*?>', ''),
        (r'<style.*?>[\s\S]*?</style>', ''),      # tag and its body
        # NOTE(review): the original ran three substitutions that all render
        # as a plain space here; they may have been distinct whitespace
        # characters or entities before the listing was extracted -- confirm.
        (r' ', ''),
    )
)


def content_sort(content):
    """Strip unwanted markup from a scraped title or article body.

    Attributes on ``<p>`` tags are dropped, several kinds of tags are removed
    (``<script>``/``<style>`` together with their bodies) and all spaces are
    deleted.

    The original passed ``re.IGNORECASE`` positionally to ``re.sub`` for the
    ``<img>`` rules, where it is interpreted as ``count``; only two images
    were removed and matching stayed case-sensitive. All rules now use the
    flag properly.

    Args:
        content: HTML fragment to clean.

    Returns:
        The cleaned text.
    """
    for pattern, replacement in _CLEANUP_RULES:
        content = pattern.sub(replacement, content)
    return content
# Lookup table: host name -> regexes and page encoding for that site.
#   'title'   regex whose group 1 is the article title
#   'content' regex whose group 1 is the article body
#   'decode'  codec used to decode the extracted text
# The title separator class is written '[-_|]' everywhere: the original
# '[_-|]' (p5w, nbd) is a range from '_' to '|' that matches every lowercase
# letter and does not match '-'.
req_dict = {
    'finance.sina.com.cn': {
        'title': r'<h1.*?>(.*?)</h1>',
        'content': r'<!-- 原始正文start -->([\s\S]*?)<!-- 原始正文end -->',
        'decode': 'utf-8',
    },
    'stock.eastmoney.com': {
        'title': r'<h1.*?>(.*?)</h1>',
        'content': r'<div id="ContentBody" class="Body">([\s\S]*?)<div class="BodyEnd">',
        'decode': 'gbk',
    },
    'finance.eastmoney.com': {  # ok
        'title': r'<h1.*?>(.*?)</h1>',
        'content': r'<div id="ContentBody" class="Body">([\s\S]*?)<div class="BodyEnd">',
        'decode': 'gbk',
    },
    'guba.eastmoney.com': {  # ok
        'title': r'<title>(.*?)_.*?</title>',
        'content': r'<div id="zwconbody">([\s\S]*?)<div class="zwconbtns clearfix">',
        'decode': 'utf-8',
    },
    'stock.jrj.com.cn': {
        'title': r'<title>(.*?)-',
        'content': r'<div class="texttit_m1">([\s\S]*?)<div id="itougu">',
        'decode': 'gbk',
    },
    'hk.jrj.com.cn': {
        'title': r'<title>(.*?)-',
        'content': r'<div class="texttit_m1">([\s\S]*?)<div id="itougu">',
        'decode': 'gbk',
    },
    'hkstock.cnfol.com': {  # ok
        'title': r'<title>(.*?)_.*?</title>',
        'content': r'<div class="ArtM" id="Content">([\s\S]*?)<!--正文结束-->',
        'decode': 'utf-8',
    },
    'sc.stock.cnfol.com': {  # ok
        'title': r'<title>(.*?)_.*?</title>',
        'content': r'<div class="ArtM" id="Content">([\s\S]*?)<!--正文结束-->',
        'decode': 'utf-8',
    },
    'money.163.com': {
        'title': r'<title>(.*?)_.*?</title>',
        'content': r'<div class="post_text".*?">([\s\S]*?)<!--.*?s -->',
        'decode': 'utf-8',
    },
    'www.chinastock.com.cn': {
        'title': r'<div class="d_title">([\s\S]*?)</div>',
        'content': r'<div class="d_content" id="Zoom">([\s\S]*?)<div class="dleft_new_attachment">',
        'decode': 'utf-8',
    },
    'stock.huagu.com': {
        'title': r'<h1 id="h1-title">([\s\S]*?)</h1>',
        'content': r'<div class="article_con" id="div-article-content">([\s\S]*?)<div class="clear"></div>',
        'decode': 'utf-8',
    },
    'stock.sohu.com': {
        'title': r'<h1 itemprop="headline">([\s\S]*?)</h1>',
        'content': r'<div itemprop="articleBody">([\s\S]*?)<div class="original-title"',
        'decode': 'gbk',
    },
    'stock.cngold.org': {
        'title': r'<title>(.*?)-.*?</title>',
        'content': r'<div class="det_content" id="zoom">([\s\S]*?)<div class="listPage">',
        'decode': 'utf-8',
    },
    'hk.stock.hexun.com': {
        'title': r'<title>(.*?)[-_|].*?</title>',
        'content': r'<div class="art_contextBox">([\s\S]*?)<div class="showAll">',
        'decode': 'utf-8',
    },
    'stock.gucheng.com': {
        'title': r'<title>(.*?)[-_|].*?</title>',
        'content': r'<div class="content">([\s\S]*?)</div>',
        'decode': 'utf-8',
    },
    'www.cnstock.com': {
        'title': r'<title>(.*?)-.*?</title>',
        'content': r'<div class="content-inner" id="qmt_content_div">([\s\S]*?)</div>',
        'decode': 'gbk',
    },
    'www.ccstock.cn': {
        'title': r'<title>(.*?)-.*?</title>',
        'content': r'<div id="newscontent">([\s\S]*?)</div>',
        'decode': 'utf-8',
    },
    'news.emoney.cn': {
        'title': r'<title>(.*?)-.*?</title>',
        'content': r'<div class="RL_details_content">([\s\S]*?)<div class="PageNav">',
        'decode': 'utf-8',
    },
    'finance.ce.cn': {
        'title': r'<title>(.*?)</title>',
        'content': r'<div class=TRS_Editor>([\s\S]*?)<textarea id="allinfo"',
        'decode': 'gbk',
    },
    'www.p5w.net': {
        'title': r'<title>(.*?)[-_|].*?</title>',
        'content': r'<div class="text">([\s\S]*?)<div class="pages">',
        'decode': 'gbk',
    },
    'www.nbd.com.cn': {
        'title': r'<title>(.*?)[-_|][\s\S]*?</title>',
        'content': r'<div class="main-left-article">([\s\S]*?)<div style="overflow:',
        'decode': 'utf-8',
    },
    'stock.hexun.com': {
        'title': r'<title>(.*?)[-_|].*?</title>',
        'content': r'<div class="art_contextBox">([\s\S]*?)<div class="showAll">',
        'decode': 'gbk',
    },
    'stock.caijing.com.cn': {
        'title': r'<title>(.*?)[-_|].*?</title>',
        'content': r'<div id="the_content".*?>([\s\S]*?)<div class="ar_writer"',
        'decode': 'utf-8',
    },
}
def id():
    """Return the id to use for the next article in ``v9_news``.

    The id is needed before inserting because phpcms displays articles
    through a URL that embeds the id, and that URL is stored in the database.

    The original counted the rows and added one, which repeats an existing id
    as soon as any article has been deleted; the largest existing id is used
    instead. The connection is now always closed.

    NOTE: the name shadows the builtin ``id()``; it is kept because the rest
    of the script calls it.

    Returns:
        int: One more than the largest ``id`` in ``v9_news`` (1 when the
        table is empty).
    """
    con = MySQLdb.connect('localhost', 'root', '', 'phpcmsv9', charset='utf8')
    try:
        cur = con.cursor()
        cur.execute("select coalesce(max(id), 0) + 1 from v9_news")
        return int(cur.fetchone()[0])
    finally:
        con.close()
def CmsSQL(title, content):
    """Write one article into the phpcms tables.

    Inserts the body into ``v9_news_data`` and the metadata into ``v9_news``
    (catid 6, typeid 0), then commits. The article id and URL are read from
    the module-level names ``idnum`` and ``urlid``, which the caller must set
    before calling.

    Args:
        title: Article title.
        content: Article body.

    Raises:
        MySQLdb.Error: Propagated from the driver; nothing is committed
            unless both inserts succeed.
    """
    # One timestamp for both columns so inputtime and updatetime always agree.
    now = int(time.time())
    db = MySQLdb.connect('localhost', 'root', '', 'phpcmsv9', charset='utf8')
    try:
        cursor = db.cursor()
        cursor.execute(
            "insert into v9_news_data (content,id) values(%s,%s)",
            [content, idnum])
        cursor.execute(
            "insert into v9_news(title,catid,typeid,url,inputtime,updatetime) values(%s,6,0,%s,%s,%s)",
            [title, urlid, now, now])
        db.commit()
    finally:
        # Close the connection even when an insert fails.
        db.close()
# Script body: read the Baidu News RSS feed and publish every article that
# has not been collected yet and whose host has rules in req_dict.
# urlid and idnum stay module-level names because CmsSQL() reads them.
url = 'http://news.baidu.com/n?cmd=4&class=gegu&tn=rss'
urls = re.findall(r'<link><!\[CDATA\[(.*?)\]\]></link>', curl(url))
urls.reverse()

# Load the collected URLs once and compare whole lines. The original re-read
# the file for every URL, matched by substring (so a URL that is a prefix of
# a stored one counted as collected) and crashed when the file was missing.
try:
    with open('urls.txt') as seen_file:
        seen_urls = set(entry.strip() for entry in seen_file)
except IOError:
    seen_urls = set()

for url in urls:
    url = url.strip()  # the original discarded the result of strip()
    if url in seen_urls:
        print(u'此url在列表中:')
        continue

    parts = url.split('/')
    line = parts[2] if len(parts) > 2 else ''  # host name of the article
    if line not in req_dict:
        # No extraction rules for this site yet: remember the URL for later.
        print(u'正则不存在')
        with open('requrl', 'a+') as missing_file:
            missing_file.write(url + '\n')
        continue

    rules = req_dict[line]
    time.sleep(1)  # pause between article fetches
    try:
        html = curl(url)  # fetch once; the original downloaded the page twice
        raw_title = search(rules['title'], html)
        raw_content = search(rules['content'], html)
        if raw_title == 'no' or raw_content == 'no':
            # search() returns 'no' when a regex does not match; do not
            # publish that sentinel as an article.
            print(u'提取失败,跳过: %s' % url)
            continue
        title = raw_title.decode(rules['decode'])
        content = url + raw_content.decode(rules['decode'])
    except Exception as exc:
        # Best effort per article: report the failure and go on.
        print(u'抓取或解码失败,跳过: %s (%s)' % (url, exc))
        continue

    # Ask the database once; the original opened three connections here.
    idnum = int(id())
    urlid = 'http://localhost/index.php?m=content&c=index&a=show&catid=6&id=%s' % idnum
    clean_title = content_sort(title)
    print('%s %s' % (idnum, clean_title))
    CmsSQL(clean_title, content_sort(content))

    # Record the URL only after it was published.
    with open('urls.txt', 'a+') as seen_file:
        seen_file.write(url + '\n')
    seen_urls.add(url)

本文转自:http://bigwayseo.com/2456




  • zeropython 微信公众号 5868037 QQ号 5868037@qq.com QQ邮箱
http://www.lbrq.cn/news/1271647.html

相关文章:

  • saas云建站平台源码/外贸推广平台排名
  • 如何购买网站主机/互联网推广
  • 青岛seo软件/windows11优化大师
  • php做网站用html做吗/没有限制的国外搜索引擎
  • wp做网站/最新新闻热点事件
  • dwcc如何做网站/网络软文发布
  • 做网站地图邮什么好处/小说推文推广平台
  • 企业门户网站的意义/百度百度一下
  • 青岛模板做网站/福州百度推广排名优化
  • 个人网站备案网址导航/广州广告推广公司
  • 电商网站建设推荐/下载百度官方网站
  • 网站开发可以开发哪些/seo专员招聘
  • 日本人真人做真爱免费的网站/微信搜一搜排名优化
  • 阎良网站建设/sem与seo
  • 电话销售做网站犯法吗/网站自然排名优化
  • 通江移动网站建设/百度广告业务
  • 苏州谢谢网络公司/百度推广账户优化
  • 教育网站建设收费/免费个人网站建设
  • 幼儿园主题活动网络图/海淀搜索引擎优化seo
  • 手机上如何制作app/安卓优化大师清理
  • 新疆建设厅网站房屋租赁合同/企业seo职位
  • 沈阳网站建设公司电话/流量网站
  • 建网站的公司广州排名/运营培训班学费大概多少
  • 用动易做的校园网站/推广软文发布平台
  • 如何留住网站用户/微信代运营
  • 网站发外链/谷歌浏览器官网下载
  • 校园网站集群建设/百度广告收费表
  • 网站实现中英文/windows7优化大师
  • 网站宽屏/什么叫网络营销
  • 网络问卷制作平台/厦门seo厦门起梦
  • 自动驾驶中的传感器技术15——Camera(6)
  • 基于深度学习的医学图像分析:使用MobileNet实现医学图像分类
  • 数据结构初学习、单向链表
  • chrome的数据采集插件chat4data的使用
  • Day25-对称二叉树-
  • JVM学习日记(十四)Day14——性能监控与调优(一)