视频1 视频21 视频41 视频61 视频文章1 视频文章21 视频文章41 视频文章61 推荐1 推荐3 推荐5 推荐7 推荐9 推荐11 推荐13 推荐15 推荐17 推荐19 推荐21 推荐23 推荐25 推荐27 推荐29 推荐31 推荐33 推荐35 推荐37 推荐39 推荐41 推荐43 推荐45 推荐47 推荐49 关键词1 关键词101 关键词201 关键词301 关键词401 关键词501 关键词601 关键词701 关键词801 关键词901 关键词1001 关键词1101 关键词1201 关键词1301 关键词1401 关键词1501 关键词1601 关键词1701 关键词1801 关键词1901 视频扩展1 视频扩展6 视频扩展11 视频扩展16 文章1 文章201 文章401 文章601 文章801 文章1001 资讯1 资讯501 资讯1001 资讯1501 标签1 标签501 标签1001 关键词1 关键词501 关键词1001 关键词1501 专题2001
python识别验证码
2020-11-27 14:22:19 责编:小采
文档


这次给大家带来python识别验证码,python识别验证码的注意事项有哪些,下面就是实战案例,一起来看一下。

除了传统的PIL包处理图片,然后用pytessert+OCR识别意外,还可以使用tessorflow训练来识别验证码。

此篇代码大部分是转载的,只改了很少地方。

代码是运行在linux环境,tessorflow没有支持windows的python 2.7。

gen_captcha.py代码。

#coding=utf-8
from captcha.image import ImageCaptcha # pip install captcha
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import random
# 验证码中的字符, 就不用汉字了
number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
 'v', 'w', 'x', 'y', 'z']
ALPHABET = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
 'V', 'W', 'X', 'Y', 'Z']
'''
number=['0','1','2','3','4','5','6','7','8','9']
alphabet =[]
ALPHABET =[]
'''
# 验证码一般都无视大小写;验证码长度4个字符
def random_captcha_text(char_set=number + alphabet + ALPHABET, captcha_size=4):
 captcha_text = []
 for i in range(captcha_size):
 c = random.choice(char_set)
 captcha_text.append(c)
 return captcha_text
# 生成字符对应的验证码
def gen_captcha_text_and_image():
 while(1):
 image = ImageCaptcha()
 captcha_text = random_captcha_text()
 captcha_text = ''.join(captcha_text)
 captcha = image.generate(captcha_text)
 #image.write(captcha_text, captcha_text + '.jpg') # 写到文件
 captcha_image = Image.open(captcha)
 #captcha_image.show()
 captcha_image = np.array(captcha_image)
 if captcha_image.shape==(60,160,3):
 break
 return captcha_text, captcha_image
if name == 'main':
 # 测试
 text, image = gen_captcha_text_and_image()
 print image
 gray = np.mean(image, -1)
 print gray
 print image.shape
 print gray.shape
 f = plt.figure()
 ax = f.add_subplot(111)
 ax.text(0.1, 0.9, text, ha='center', va='center', transform=ax.transAxes)
 plt.imshow(image)
 plt.show()

train.py代码。

#coding=utf-8
from gen_captcha import gen_captcha_text_and_image
from gen_captcha import number
from gen_captcha import alphabet
from gen_captcha import ALPHABET
import numpy as np
import tensorflow as tf
"""
text, image = gen_captcha_text_and_image()
print "验证码图像channel:", image.shape # (60, 160, 3)
# 图像大小
IMAGE_HEIGHT = 60
IMAGE_WIDTH = 160
MAX_CAPTCHA = len(text)
print "验证码文本最长字符数", MAX_CAPTCHA # 验证码最长4字符; 我全部固定为4,可以不固定. 如果验证码长度小于4,用'_'补齐
"""
IMAGE_HEIGHT = 60
IMAGE_WIDTH = 160
MAX_CAPTCHA = 4
# 把彩色图像转为灰度图像(色彩对识别验证码没有什么用)
def convert2gray(img):
 if len(img.shape) > 2:
 gray = np.mean(img, -1)
 # 上面的转法较快,正规转法如下
 # r, g, b = img[:,:,0], img[:,:,1], img[:,:,2]
 # gray = 0.29 * r + 0.5870 * g + 0.1140 * b
 return gray
 else:
 return img
"""
cnn在图像大小是2的倍数时性能最高, 如果你用的图像大小不是2的倍数,可以在图像边缘补无用像素。
np.pad(image,((2,3),(2,2)), 'constant', constant_values=(255,)) # 在图像上补2行,下补3行,左补2行,右补2行
"""
# 文本转向量
char_set = number + alphabet + ALPHABET + ['_'] # 如果验证码长度小于4, '_'用来补齐
CHAR_SET_LEN = len(char_set)
def text2vec(text):
 text_len = len(text)
 if text_len > MAX_CAPTCHA:
 raise ValueError('验证码最长4个字符')
 vector = np.zeros(MAX_CAPTCHA * CHAR_SET_LEN)
 def char2pos(c):
 if c == '_':
 k = 62
 return k
 k = ord(c) - 48
 if k > 9:
 k = ord(c) - 55
 if k > 35:
 k = ord(c) - 61
 if k > 61:
 raise ValueError('No Map')
 return k
 for i, c in enumerate(text):
 #print text
 idx = i * CHAR_SET_LEN + char2pos(c)
 #print i,CHAR_SET_LEN,char2pos(c),idx
 vector[idx] = 1
 return vector
#print text2vec('1aZ_')
# 向量转回文本
def vec2text(vec):
 char_pos = vec.nonzero()[0]
 text = []
 for i, c in enumerate(char_pos):
 char_at_pos = i # c/63
 char_idx = c % CHAR_SET_LEN
 if char_idx < 10:
 char_code = char_idx + ord('0')
 elif char_idx < 36:
 char_code = char_idx - 10 + ord('A')
 elif char_idx < 62:
 char_code = char_idx - 36 + ord('a')
 elif char_idx == 62:
 char_code = ord('_')
 else:
 raise ValueError('error')
 text.append(chr(char_code))
 return "".join(text)
"""
#向量(大小MAX_CAPTCHA*CHAR_SET_LEN)用0,1编码 每63个编码一个字符,这样顺利有,字符也有
vec = text2vec("F5Sd")
text = vec2text(vec)
print(text) # F5Sd
vec = text2vec("SFd5")
text = vec2text(vec)
print(text) # SFd5
"""
# 生成一个训练batch
def get_next_batch(batch_size=128):
 batch_x = np.zeros([batch_size, IMAGE_HEIGHT * IMAGE_WIDTH])
 batch_y = np.zeros([batch_size, MAX_CAPTCHA * CHAR_SET_LEN])
 # 有时生成图像大小不是(60, 160, 3)
 def wrap_gen_captcha_text_and_image():
 while True:
 text, image = gen_captcha_text_and_image()
 if image.shape == (60, 160, 3):
 return text, image
 for i in range(batch_size):
 text, image = wrap_gen_captcha_text_and_image()
 image = convert2gray(image)
 batch_x[i, :] = image.flatten() / 255 # (image.flatten()-128)/128 mean为0
 batch_y[i, :] = text2vec(text)
 return batch_x, batch_y
####################################################################
X = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT * IMAGE_WIDTH])
Y = tf.placeholder(tf.float32, [None, MAX_CAPTCHA * CHAR_SET_LEN])
keep_prob = tf.placeholder(tf.float32) # dropout
# 定义CNN
def crack_captcha_cnn(w_alpha=0.01, b_alpha=0.1):
 x = tf.reshape(X, shape=[-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])
 # w_c1_alpha = np.sqrt(2.0/(IMAGE_HEIGHT*IMAGE_WIDTH)) #
 # w_c2_alpha = np.sqrt(2.0/(3*3*32))
 # w_c3_alpha = np.sqrt(2.0/(3*3*))
 # w_d1_alpha = np.sqrt(2.0/(8*32*))
 # out_alpha = np.sqrt(2.0/1024)
 # 3 conv layer
 w_c1 = tf.Variable(w_alpha * tf.random_normal([3, 3, 1, 32]))
 b_c1 = tf.Variable(b_alpha * tf.random_normal([32]))
 conv1 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(x, w_c1, strides=[1, 1, 1, 1], padding='SAME'), b_c1))
 conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 conv1 = tf.nn.dropout(conv1, keep_prob)
 w_c2 = tf.Variable(w_alpha * tf.random_normal([3, 3, 32, ]))
 b_c2 = tf.Variable(b_alpha * tf.random_normal([]))
 conv2 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv1, w_c2, strides=[1, 1, 1, 1], padding='SAME'), b_c2))
 conv2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 conv2 = tf.nn.dropout(conv2, keep_prob)
 w_c3 = tf.Variable(w_alpha * tf.random_normal([3, 3, , ]))
 b_c3 = tf.Variable(b_alpha * tf.random_normal([]))
 conv3 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv2, w_c3, strides=[1, 1, 1, 1], padding='SAME'), b_c3))
 conv3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 conv3 = tf.nn.dropout(conv3, keep_prob)
 # Fully connected layer
 w_d = tf.Variable(w_alpha * tf.random_normal([8 * 32 * 40, 1024]))
 b_d = tf.Variable(b_alpha * tf.random_normal([1024]))
 dense = tf.reshape(conv3, [-1, w_d.get_shape().as_list()[0]])
 dense = tf.nn.relu(tf.add(tf.matmul(dense, w_d), b_d))
 dense = tf.nn.dropout(dense, keep_prob)
 w_out = tf.Variable(w_alpha * tf.random_normal([1024, MAX_CAPTCHA * CHAR_SET_LEN]))
 b_out = tf.Variable(b_alpha * tf.random_normal([MAX_CAPTCHA * CHAR_SET_LEN]))
 out = tf.add(tf.matmul(dense, w_out), b_out)
 # out = tf.nn.softmax(out)
 return out
# 训练
def train_crack_captcha_cnn():
 import time
 start_time=time.time()
 output = crack_captcha_cnn()
 # loss
 #loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, Y))
 loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=Y))
 # 最后一层用来分类的softmax和sigmoid有什么不同?
 # optimizer 为了加快训练 learning_rate应该开始大,然后慢慢衰
 optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
 predict = tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN])
 max_idx_p = tf.argmax(predict, 2)
 max_idx_l = tf.argmax(tf.reshape(Y, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)
 correct_pred = tf.equal(max_idx_p, max_idx_l)
 accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
 saver = tf.train.Saver()
 with tf.Session() as sess:
 sess.run(tf.global_variables_initializer())
 step = 0
 while True:
 batch_x, batch_y = get_next_batch()
 _, loss_ = sess.run([optimizer, loss], feed_dict={X: batch_x, Y: batch_y, keep_prob: 0.75})
 print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())),step, loss_
 # 每100 step计算一次准确率
 if step % 100 == 0:
 batch_x_test, batch_y_test = get_next_batch(100)
 acc = sess.run(accuracy, feed_dict={X: batch_x_test, Y: batch_y_test, keep_prob: 1.})
 print u'***************************************************************第%s次的准确率为%s'%(step, acc)
 # 如果准确率大于50%,保存模型,完成训练
 if acc > 0.9: ##我这里设了0.9,设得越大训练要花的时间越长,如果设得过于接近1,很难达到。如果使用cpu,花的时间很长,cpu占用很高电脑发烫。
 saver.save(sess, "crack_capcha.model", global_step=step)
 print time.time()-start_time
 break
 step += 1
train_crack_captcha_cnn()

测试代码:

output = crack_captcha_cnn()
saver = tf.train.Saver()
sess = tf.Session()
saver.restore(sess, tf.train.latest_checkpoint('.'))
while(1):
 
 text, image = gen_captcha_text_and_image()
 image = convert2gray(image)
 image = image.flatten() / 255
 predict = tf.argmax(tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)
 text_list = sess.run(predict, feed_dict={X: [image], keep_prob: 1})
 predict_text = text_list[0].tolist()
 vector = np.zeros(MAX_CAPTCHA * CHAR_SET_LEN)
 i = 0
 for t in predict_text:
 vector[i * 63 + t] = 1
 i += 1
 # break
 print("正确: {} 预测: {}".format(text, vec2text(vector)))

如果想要快点测试代码效果,验证码的字符不要设置太多,例如0123这几个数字就可以了。

相信看了本文案例你已经掌握了方法,更多精彩请关注Gxl网其它相关文章!

推荐阅读:

Python怎么实现马氏距离

python怎么批量读取txt文件为DataFrame格式

下载本文
显示全文
专题