当前位置：首页 > news >正文

合肥城乡建设委员会的网站百度推广视频

news 2025/7/20 13:52:46

合肥城乡建设委员会的网站,百度推广视频,响应式网站怎么做,阳朔网站建设公司1、介绍在爬虫中经常会遇到验证码识别的问题，现在的验证码大多分计算验证码、滑块验证码、识图验证码、语音验证码等四种。本文就是识图验证码，识别的是简单的验证码，要想让识别率更高，识别的更加准确就需要花很多的精力去训练自己…

1、介绍

在爬虫中经常会遇到验证码识别的问题，现在的验证码大多分为计算验证码、滑块验证码、识图验证码、语音验证码等四种。本文针对的就是识图验证码，识别的是简单的验证码，要想让识别率更高、识别得更加准确，就需要花很多的精力去训练自己的字体库。

识别验证码通常是这几个步骤：

(1)灰度处理

(2)二值化

(3)去除边框(如果有的话)

(4)降噪

(5)切割字符或者倾斜度矫正

(6)训练字体库

(7)识别

这7个步骤中前三个步骤是基本的，4或者5可根据实际情况选择是否需要。

经常用的库有pytesseract(识别库)、OpenCV(高级图像处理库)、imagehash(图片哈希值库)、numpy(开源的、高性能的Python数值计算库)、PIL的 Image，ImageDraw，ImageFile等。

2、实例

以某网站登录的验证码识别为例：具体过程和上述的步骤稍有不同。

首先分析一下，验证码是由4个数字组成的，每个数字取自0到9这10个数字，而每个数字只有第一、第二、第三和第四等4个位置。那么计算下来共有40个数字位置，如下：

那么接下来就要对验证码图片进行降噪、分隔得到上面的图片。以这40个图片集作为基础。

对要验证的验证码图片进行降噪、分隔后获取四个类似上面的数字图片、通过和上面的比对就可以知道该验证码是什么了。

以上面验证码2837为例：

1、图片降噪

2、图片分隔

3、图片比对

把验证码降噪、分隔后得到的四个数字图片，和上面的40个数字图片进行哈希值比对，并设置一个误差 max_dif：允许的最大hash差值，越小越精确，最小为0。

这样四个数字图片通过比较后获取对应的数字，连起来，就是要获取的验证码。

完整代码如下：

#coding=utf-8

import os

import re

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

import time

from selenium.webdriver.common.action_chains import ActionChains

import collections

import mongoDbBase

import numpy

import imagehash

from PIL import Image,ImageFile

import datetime

class finalNews_IE:
    """Drive Internet Explorer through Selenium and solve a 4-digit image captcha.

    Recognition is done by template matching: the captcha is binarised, cut
    into four digit images, and the average-hash of every digit image is
    compared with the hashes of a library of reference images
    (``<codedir>/codeLibrary/code<digit>_<position>.png``, digits 0-9,
    positions 1-4).
    """

    # Extra pixels added to the nominal column width for the leading columns
    # when cutting the captcha (column 0: +6, column 1: +1, others: +0).
    # NOTE(review): the comparison thresholds of the original if/elif chain
    # were truncated in the published listing; they are reconstructed here as
    # ``index < 1`` / ``index < 2`` / otherwise -- confirm against the real
    # captcha layout.
    _COLUMN_WIDTH_ADJUST = (6, 1)

    def __init__(self, strdate, logonUrl, firstUrl, keyword_list, exportPath, codepath, codedir):
        """Start the browser, open the database and load the template hashes.

        ``codepath`` is accepted for interface compatibility but is not used.
        ``codedir`` is the directory holding ``codeLibrary`` and receiving the
        temporary screenshot / captcha images.
        """
        self.iniDriver()
        self.db = mongoDbBase.mongoDbBase()
        self.date = strdate
        self.firstUrl = firstUrl
        self.logonUrl = logonUrl
        self.keyword_list = keyword_list
        self.exportPath = exportPath
        self.codedir = codedir

        # Maps the image hash of every reference digit image to its digit.
        self.hash_code_dict = {}
        for digit in range(0, 10):
            for position in range(1, 5):
                template = os.path.join(
                    codedir, "codeLibrary", "code" + str(digit) + '_' + str(position) + ".png")
                template_hash = self.get_ImageHash(template)
                if template_hash is None:
                    # Missing template: storing None as a key would make every
                    # later hash comparison fail, so skip it.
                    continue
                self.hash_code_dict[template_hash] = str(digit)

    def iniDriver(self):
        """Create the IE WebDriver and store it in ``self.driver``."""
        # NOTE(review): the original comment says this path is read from a
        # configuration file, but it is hard-coded here.
        # Raw string: the backslashes are path separators, not escapes.
        IEDriverServer = r"C:\Program Files\Internet Explorer\IEDriverServer.exe"
        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def WriteData(self, message, fileName):
        """Append ``message`` to ``fileName`` inside the export directory."""
        fileName = os.path.join(os.getcwd(), self.exportPath + '/' + fileName)
        with open(fileName, 'a') as f:
            f.write(message)

    def get_ImageHash(self, imagefile):
        """Return the average-hash of an image file, or None if it does not exist."""
        if not os.path.exists(imagefile):
            return None
        with open(imagefile, 'rb') as fp:
            return imagehash.average_hash(Image.open(fp))

    def clearNoise(self, imageFile, x=0, y=0):
        """Binarise ``imageFile`` in place and return the resulting RGB image.

        The image is converted to greyscale and every pixel brighter than 135
        becomes white, everything else black.  Returns None when the file does
        not exist.  ``x`` and ``y`` are unused and kept only for compatibility.
        """
        if not os.path.exists(imageFile):
            return None
        # Close the source before overwriting the same path below.
        with Image.open(imageFile) as source:
            gray = numpy.asarray(source.convert('L'))
        # Cast to uint8 so Image.fromarray gets an 8-bit greyscale array; the
        # boolean * 255 product is a platform-sized integer array otherwise.
        binary = ((gray > 135) * 255).astype(numpy.uint8)
        image = Image.fromarray(binary).convert('RGB')
        image.save(imageFile)
        return image

    def splitimage(self, imagePath, imageFile, rownum=1, colnum=4):
        """Cut ``imageFile`` into ``rownum`` x ``colnum`` tiles and save them.

        imagePath: directory for the tiles; when empty, the directory of
            ``imageFile`` is used.
        imageFile: path of the image to cut.
        rownum / colnum: number of rows / columns to cut.

        Tiles are saved as ``<basename>_<n>.<ext>`` (n counted from 1) and the
        list of saved paths is returned.  An empty list is returned when the
        image is smaller than the requested grid.
        """
        with Image.open(imageFile) as img:
            w, h = img.size
            if rownum > h or colnum > w:
                return []

            print('Original image info: %sx%s, %s, %s' % (w, h, img.format, img.mode))
            print('开始处理图片切割, 请稍候...')

            src_dir, src_name = os.path.split(imageFile)
            dstpath = imagePath if imagePath != '' else src_dir
            fn = src_name.split('.')
            basename = fn[0]
            ext = fn[-1]

            rowheight = h // rownum
            colwidth = w // colnum
            adjust = self._COLUMN_WIDTH_ADJUST

            file_list = []
            num = 1
            for r in range(rownum):
                for c in range(colnum):
                    # The digits are not evenly spaced, so the leading columns
                    # use a slightly wider cell than w // colnum.
                    colwid = colwidth + (adjust[c] if c < len(adjust) else 0)
                    # (left, upper, right, lower)
                    box = (c * colwid, r * rowheight, (c + 1) * colwid, (r + 1) * rowheight)
                    newfile = os.path.join(dstpath, basename + '_' + str(num) + '.' + ext)
                    file_list.append(newfile)
                    img.crop(box).save(newfile, ext)
                    num = num + 1
            return file_list

    def compare_image_with_hash(self, image_hash1, image_hash2, max_dif=0):
        """Return True when two image hashes differ by at most ``max_dif``.

        max_dif: largest allowed hash difference; smaller is stricter and 0
        requires identical hashes.
        """
        return abs(image_hash1 - image_hash2) <= max_dif

    def getVerycode(self):
        """Recognise the captcha shown on the page currently loaded in the driver.

        Takes a screenshot, crops the element with id ``checkcode``, binarises
        and splits it, and matches every digit image against the template
        hashes.  Returns the recognised digits as a string; a digit image with
        no exact template match contributes nothing to the result.
        """
        screenshot = os.path.join(self.codedir, "Temp.png")
        self.driver.save_screenshot(screenshot)

        checkcode = self.driver.find_element_by_id("checkcode")
        location = checkcode.location  # x / y coordinates of the captcha element
        size = checkcode.size  # width / height of the captcha element
        rangle = (int(location['x']), int(location['y']),
                  int(location['x'] + size['width']),
                  int(location['y'] + size['height']))  # crop box inside the screenshot

        filename = os.path.join(self.codedir, "Temp_code.png")
        with Image.open(screenshot) as shot:
            shot.crop(rangle).save(filename)

        self.clearNoise(filename)
        file_list = self.splitimage(self.codedir, filename)

        verycode = ''
        for f in file_list:
            imageHash = self.get_ImageHash(f)
            if imageHash is None:
                continue
            for h, code in self.hash_code_dict.items():
                if self.compare_image_with_hash(imageHash, h, 0):
                    verycode += code
                    break
        return verycode

    def savePicture(self):
        """Open the login page, recognise its captcha, print it and close the browser.

        Returns the recognised code.  Because the browser window is closed
        afterwards, use ``getVerycode`` instead when the page is still needed.
        """
        self.driver.get(self.logonUrl)
        self.driver.maximize_window()
        time.sleep(1)
        verycode = self.getVerycode()
        print(verycode)
        self.driver.close()
        return verycode

    def longon(self):
        """Open the login page, fill in the form with the recognised captcha and submit."""
        self.driver.get(self.logonUrl)
        self.driver.maximize_window()
        time.sleep(1)

        accname = self.driver.find_element_by_id("username")
        accname.send_keys('ctrchina')

        # The password field is located but left empty: the send_keys call was
        # commented out in the published listing.  Supply the real password
        # here before use.
        accpwd = self.driver.find_element_by_id("password")

        # Recognise the captcha on this same page load; reloading the page
        # would presumably produce a different captcha -- TODO confirm.
        code = self.getVerycode()
        checkcode = self.driver.find_element_by_name("checkcode")
        checkcode.send_keys(code)

        submit = self.driver.find_element_by_name("button")
        submit.click()

实例补充：

# -*- coding: utf-8 -*

import sys

reload(sys)

sys.setdefaultencoding( "utf-8" )

import re

import requests

import io

import os

import json

from PIL import Image

from PIL import ImageEnhance

from bs4 import BeautifulSoup

import mdata

class Student:
    """Session-based client for the student system served at 202.118.31.197.

    Logs in by recognising the login captcha with pixel templates from
    ``mdata.data`` and then fetches the student-record page, the photo and the
    transcript through the same ``requests`` session.
    """

    BASE_URL = 'http://202.118.31.197/'

    # NOTE(review): the captcha <img> pattern was truncated in the published
    # listing (everything after ``re.findall('`` was lost).  This pattern is a
    # reconstruction -- confirm it against the HTML of the login page.
    _CAPTCHA_IMG_PATTERN = '<img src="(.+?)" width="55"'

    # Left edge of each of the four captcha characters; every character is
    # cropped as a 13 x 20 pixel cell.
    _GLYPH_LEFT_EDGES = (6, 19, 32, 45)
    _GLYPH_WIDTH = 13
    _GLYPH_HEIGHT = 20

    def __init__(self, user, password):
        """Store the credentials as strings and open a requests session."""
        self.user = str(user)
        self.password = str(password)
        self.s = requests.Session()

    @staticmethod
    def _child_tags(node):
        """Return the direct children of ``node`` that have the same type as ``node``.

        For a BeautifulSoup tag this keeps child tags and drops text nodes.
        """
        return [child for child in node if type(child) is type(node)]

    @staticmethod
    def _match_glyph(glyph):
        """Return the key of ``mdata.data`` whose template best matches ``glyph``.

        The glyph is flattened row by row into 1 (black pixel) / 0 (other)
        values and scored by the number of positions equal to the template.
        Returns "" when no template scores above zero.
        presumably ``mdata.data`` maps a character to a flat 0/1 list -- TODO confirm.
        """
        width, height = glyph.size
        bits = [1 if glyph.getpixel((col, row)) == (0, 0, 0) else 0
                for row in range(height) for col in range(width)]
        best_score = 0
        best_char = ""
        for char in mdata.data:
            score = sum(1 for got, want in zip(bits, mdata.data[char]) if got == want)
            if score > best_score:
                best_score = score
                best_char = char
        return best_char

    def login(self):
        """Log in and return ``[ok, message]``.

        ``ok`` is True when the response contains ``images/Logout2``.
        ``message`` is a text line built from the server response followed by
        the user name and password.
        NOTE(review): the message embeds the plain-text password; avoid
        writing it to logs.
        """
        url = self.BASE_URL + "ACTIONLOGON.APPPROCESS?mode=4"
        res = self.s.get(url).text
        imageUrl = self.BASE_URL + re.findall(self._CAPTCHA_IMG_PATTERN, res)[0]

        im = Image.open(io.BytesIO(self.s.get(imageUrl).content))
        im = ImageEnhance.Contrast(im).enhance(7)

        # After the contrast boost, turn every pixel that is not pure black
        # into white so only the glyph strokes remain.
        width, height = im.size
        for row in range(height):
            for col in range(width):
                if im.getpixel((col, row)) != (0, 0, 0):
                    im.putpixel((col, row), (255, 255, 255))

        verifyCode = ""
        for left in self._GLYPH_LEFT_EDGES:
            glyph = im.crop((left, 0, left + self._GLYPH_WIDTH, self._GLYPH_HEIGHT))
            verifyCode += self._match_glyph(glyph)

        data = {
            'WebUserNO': str(self.user),
            'Password': str(self.password),
            'Agnomen': verifyCode,
        }
        t = self.s.post(url, data=data).text

        if re.findall("images/Logout2", t) == []:
            # The parentheses in the pattern are regex groups, so each match is
            # a tuple; the second match's text has two characters stripped from
            # each end before being reported.
            l = '[0,"' + re.findall('alert((.+?));', t)[1][1][2:-2] + '"]' + " " + self.user + " " + self.password + "\n"
            return [False, l]
        else:
            l = '登录成功 ' + re.findall('! (.+?) ', t)[0] + " " + self.user + " " + self.password + "\n"
            return [True, l]

    def getInfo(self):
        """Return the student-record page as a JSON object string.

        The text of the cells found in the first two ``align="left"`` tables
        is read in order and paired up as key, value, key, value, ...
        """
        # The published listing had a full-width '＝' in this query string;
        # the ASCII '=' is used here, matching the login URL.
        page = self.s.get(self.BASE_URL + 'ACTIONQUERYBASESTUDENTINFO.APPPROCESS?mode=3').text
        soup = BeautifulSoup(page, "lxml")
        tables = soup.find_all("table", attrs={'align': "left"})

        cells = []
        for table in (tables[0], tables[1]):
            for row in self._child_tags(table):
                for cell in self._child_tags(row):
                    cells.append(cell.text)

        info = dict(zip(cells[0::2], cells[1::2]))
        return json.dumps(info)

    def getPic(self):
        """Download the user's photo and return it as a PIL image."""
        imageUrl = self.BASE_URL + 'ACTIONDSPUSERPHOTO.APPPROCESS'
        return Image.open(io.BytesIO(self.s.get(imageUrl).content))

    def getScore(self):
        """Print the grade-point summary and return the transcript as JSON.

        The result is a JSON list with one object per table row, keyed by the
        header cells of the first row; the first (header) row and the last row
        are dropped.
        """
        page = self.s.get(self.BASE_URL + 'ACTIONQUERYSTUDENTSCORE.APPPROCESS').text
        score = BeautifulSoup(page, "lxml")

        point = score.find_all(attrs={'height': "36"})[0].text
        print(point[point.find('平均学分绩点'):])

        table = score.html.body.table
        rows = table.find_all('table', attrs={'align': 'left'})[0].find_all('tr')

        subject = [cell.string for cell in self._child_tags(rows[0])]
        lesson = []
        for row in rows:
            temp = {}
            for k, cell in enumerate(self._child_tags(row)):
                temp[subject[k]] = cell.string
            lesson.append(temp)

        return json.dumps(lesson[1:-1])

    def logoff(self):
        """Request the logout page and return the response text."""
        return self.s.get(self.BASE_URL + 'ACTIONLOGOUT.APPPROCESS').text

if __name__ == "__main__":
    # Demo run: log in, then print the transcript and the student record and
    # show the photo.  The credentials below are placeholders.
    a = Student(20150000, 20150000)
    r = a.login()
    print(r[1])
    if r[0]:
        # One transcript row per output line, values separated by spaces.
        for lesson in json.loads(a.getScore()):
            print(" ".join(["%s" % lesson[key] for key in lesson]))

        q = json.loads(a.getInfo())
        for key in q:
            print("%s %s" % (key, q[key]))

        a.getPic().show()
        a.logoff()

到此这篇关于python识别验证码的思路及解决方案的文章就介绍到这了，更多相关python识别验证码的思路的内容，请搜索脚本之家以前的文章或继续浏览下面的相关文章，希望大家以后多多支持脚本之家！

查看全文

http://www.lbrq.cn/news/2391913.html

武汉网站制作wordpress seo教程

做circrna的网站网站制作教程

邢台瑞光网络科技有限公司关键词排名优化顾问

香港网站icp备案推广赚钱软件排行

网站开发存在的风险爱站网长尾关键词挖掘查询工具

茶网站建设宗旨seo网络科技有限公司

WordPress图片方案上海外贸seo

做门窗五金的网站域名注册管理机构

做网站博彩代理怎么找客源seo优化sem推广

企业网站建设杭州公司seo 网站推广

南通网站制作昨日凌晨北京突然宣布重大消息

哪儿网站建设费用低宁波营销型网站建设优化建站

修改动态网站推广之家app

网站开发包括几个部分余姚网站如何进行优化

公司做的网站版权归谁所有百度关键词搜索引擎

金华市住房建设局网站web网站设计

兰州中川国际机场海外seo网站推广

个人做视频网站视频储存重庆网站设计

wordpress如何创建导航栏河南网站推广优化

张家港做网站的推荐seo网站优化案例

信誉好的常州网站建设汕头seo网络推广服务

电子商务网站建设实训过程2023年8月疫情恢复

宝安住房和建设局网站微信朋友圈广告在哪里做

分销网站有哪些深圳网站建设服务

在摄像机视图中想像在普通 3D 视口里那样随意移动

OpenCV 官翻 3 - 特征检测 Feature Detection

前端下载文件并按GBK编码解析内容

摩尔投票法：高效寻找数组中的多数元素

【C# in .NET】16. 探秘类成员-索引器：通过索引访问对象

Taro.getRandomValues() 用法详解

相关文章：