视频1 视频21 视频41 视频61 视频文章1 视频文章21 视频文章41 视频文章61 推荐1 推荐3 推荐5 推荐7 推荐9 推荐11 推荐13 推荐15 推荐17 推荐19 推荐21 推荐23 推荐25 推荐27 推荐29 推荐31 推荐33 推荐35 推荐37 推荐39 推荐41 推荐43 推荐45 推荐47 推荐49 关键词1 关键词101 关键词201 关键词301 关键词401 关键词501 关键词601 关键词701 关键词801 关键词901 关键词1001 关键词1101 关键词1201 关键词1301 关键词1401 关键词1501 关键词1601 关键词1701 关键词1801 关键词1901 视频扩展1 视频扩展6 视频扩展11 视频扩展16 文章1 文章201 文章401 文章601 文章801 文章1001 资讯1 资讯501 资讯1001 资讯1501 标签1 标签501 标签1001 关键词1 关键词501 关键词1001 关键词1501 专题2001
Python实现多线程抓取妹子图
2020-11-27 14:41:54 责编:小采
文档

心血来潮写了个多线程抓妹子图,虽然代码还是有一些瑕疵,但是还是记录下来,分享给大家。

Pic_downloader.py

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015
 
@author: Dreace
"""
import urllib2
import sys
import time
import os
import random
from multiprocessing.dummy import Pool as ThreadPool 
type_ = sys.getfilesystemencoding()
def rename():
 return time.strftime("%Y%m%d%H%M%S")
def rename_2(name): 
 if len(name) == 2: 
 name = '0' + name + '.jpg' 
 elif len(name) == 1: 
 name = '00' + name + '.jpg' 
 else: 
 name = name + '.jpg' 
 return name
def download_pic(i):
 global count
 global time_out
 if Filter(i):
 try: 
 content = urllib2.urlopen(i,timeout = time_out)
 url_content = content.read()
 f = open(repr(random.randint(10000,999999999)) + "_" + rename_2(repr(count)),"wb")
 f.write(url_content)
 f.close()
 count += 1
 except Exception, e:
 print i + "下载超时,跳过!".decode("utf-8").encode(type_)
def Filter(content):
 for line in Filter_list:
 line=line.strip('
')
 if content.find(line) == -1:
 return True
def get_pic(url_address):
 global pic_list
 try:
 str_ = urllib2.urlopen(url_address, timeout = time_out).read()
 url_content = str_.split(""")
 for i in url_content:
 if i.find(".jpg") != -1:
 pic_list.append(i) 
 except Exception, e:
 print "获取图片超时,跳过!".decode("utf-8").encode(type_)
MAX = 2
count = 0
time_out = 60
thread_num = 30
pic_list = []
page_list = []
Filter_list = ["imgsize.ph.126.net","img.ph.126.net","img2.ph.126.net"]
dir_name = "C:Photos\"+rename()
os.makedirs(dir_name)
os.chdir(dir_name)
start_time = time.time()
url_address = "http://sexy.faceks.com/?page="
for i in range(1,MAX + 1): 
 page_list.append(url_address + repr(i))
page_pool = ThreadPool(thread_num)
page_pool.map(get_pic,page_list)
print "获取到".decode("utf-8").encode(type_),len(pic_list),"张图片,开始下载!".decode("utf-8").encode(type_)
pool = ThreadPool(thread_num) 
pool.map(download_pic,pic_list)
pool.close() 
pool.join()
print count,"张图片保存在".decode("utf-8").encode(type_) + dir_name
print "共耗时".decode("utf-8").encode(type_),time.time() - start_time,"s"

我们来看下一个网友的作品

#coding: utf-8 #############################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
#!/usr/bin/python

import re,urllib2,HTMLParser,threading,Queue,time

#各图集入口链接
htmlDoorList = []
#包含图片的Hmtl链接
htmlUrlList = []
#图片Url链接Queue
imageUrlList = Queue.Queue(0)
#捕获图片数量
imageGetCount = 0
#已下载图片数量
imageDownloadCount = 0
#每个图集的起始地址,用于判断终止
nextHtmlUrl = ''
#本地保存路径
localSavePath = '/data/1920x1080/'

#如果你想下你需要的分辨率的,请修改replace_str,有如下分辨率可供选择1920x1200,1980x1920,1680x1050,1600x900,1440x900,1366x768,1280x1024,1024x768,1280x800
replace_str = '1920x1080'

replaced_str = '960x600'

#内页分析处理类
class ImageHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
self.nextUrl = ''
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
global imageUrlList
if(tag == 'img' and len(attrs) > 2 ):
if(attrs[0] == ('id','bigImg')):
url = attrs[1][1]
url = url.replace(replaced_str,replace_str)
imageUrlList.put(url)
global imageGetCount
imageGetCount = imageGetCount + 1
print url
elif(tag == 'a' and len(attrs) == 4):
if(attrs[0] == ('id','pageNext') and attrs[1] == ('class','next')):
global nextHtmlUrl
nextHtmlUrl = attrs[2][1];

#首页分析类
class IndexHtmlParser(HTMLParser.HTMLParser):
def __init__(self):
self.urlList = []
self.index = 0
self.nextUrl = ''
self.tagList = ['li','a']
self.classList = ['photo-list-padding','pic']
HTMLParser.HTMLParser.__init__(self)
def handle_starttag(self,tag,attrs):
if(tag == self.tagList[self.index]):
for attr in attrs:
if (attr[1] == self.classList[self.index]):
if(self.index == 0):
#第一层找到了
self.index = 1
else:
#第二层找到了
self.index = 0
print attrs[1][1]
self.urlList.append(attrs[1][1])
break
elif(tag == 'a'):
for attr in attrs:
if (attr[0] == 'id' and attr[1] == 'pageNext'):
self.nextUrl = attrs[1][1]
print 'nextUrl:',self.nextUrl
break

#首页Hmtl解析器
indexParser = IndexHtmlParser()
#内页Html解析器
imageParser = ImageHtmlParser()

#根据首页得到所有入口链接
print '开始扫描首页...'
host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'
while (indexUrl != ''):
print '正在抓取网页:',host+indexUrl
request = urllib2.Request(host+indexUrl)
try:
m = urllib2.urlopen(request)
con = m.read()
indexParser.feed(con)
if (indexUrl == indexParser.nextUrl):
break
else:
indexUrl = indexParser.nextUrl
except urllib2.URLError,e:
print e.reason

print '首页扫描完成,所有图集链接已获得:'
htmlDoorList = indexParser.urlList

#根据入口链接得到所有图片的url
class getImageUrl(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
for door in htmlDoorList:
print '开始获取图片地址,入口地址为:',door
global nextHtmlUrl
nextHtmlUrl = ''
while(door != ''):
print '开始从网页%s获取图片...'% (host+door)
if(nextHtmlUrl != ''):
request = urllib2.Request(host+nextHtmlUrl)
else:
request = urllib2.Request(host+door)
try:
m = urllib2.urlopen(request)
con = m.read()
imageParser.feed(con)
print '下一个页面地址为:',nextHtmlUrl
if(door == nextHtmlUrl):
break
except urllib2.URLError,e:
print e.reason
print '所有图片地址均已获得:',imageUrlList

class getImage(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
def run(self):
global imageUrlList
print '开始下载图片...'
while(True):
print '目前捕获图片数量:',imageGetCount
print '已下载图片数量:',imageDownloadCount
image = imageUrlList.get()
print '下载文件路径:',image
try:
cont = urllib2.urlopen(image).read()
patter = '[0-9]*.jpg';
match = re.search(patter,image);
if match:
print '正在下载文件:',match.group()
filename = localSavePath+match.group()
f = open(filename,'wb')
f.write(cont)
f.close()
global imageDownloadCount
imageDownloadCount = imageDownloadCount + 1
else:
print 'no match'
if(imageUrlList.empty()):
break
except urllib2.URLError,e:
print e.reason
print '文件全部下载完成...'

get = getImageUrl()
get.start()
print '获取图片链接线程启动:'

time.sleep(2)

download = getImage()
download.start()
print '下载图片链接线程启动:'


批量抓取指定网页上的所有图片

# -*- coding:utf-8 -*-
# coding=UTF-8
 
import os,urllib,urllib2,re
 
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
outpath = "t:\"
 
def getHtml(url):
 webfile = urllib.urlopen(url)
 outhtml = webfile.read()
 print outhtml
 return outhtml
 
def getImageList(html):
 restr=ur'('
 restr+=ur'http://[^s,"]*.jpg'
 restr+=ur'|http://[^s,"]*.jpeg'
 restr+=ur'|http://[^s,"]*.png'
 restr+=ur'|http://[^s,"]*.gif'
 restr+=ur'|http://[^s,"]*.bmp'
 restr+=ur'|https://[^s,"]*.jpeg' 
 restr+=ur'|https://[^s,"]*.jpeg'
 restr+=ur'|https://[^s,"]*.png'
 restr+=ur'|https://[^s,"]*.gif'
 restr+=ur'|https://[^s,"]*.bmp'
 restr+=ur')'
 htmlurl = re.compile(restr)
 imgList = re.findall(htmlurl,html)
 print imgList
 return imgList
 
def download(imgList, page):
 x = 1
 for imgurl in imgList:
 filepathname=str(outpath+'pic_%09d_%010d'%(page,x)+str(os.path.splitext(urllib2.unquote(imgurl).decode('utf8').split('/')[-1])[1])).lower()
 print '[Debug] Download file :'+ imgurl+' >> '+filepathname
 urllib.urlretrieve(imgurl,filepathname)
 x+=1
 
def downImageNum(pagenum):
 page = 1
 pageNumber = pagenum
 while(page <= pageNumber):
 html = getHtml(url)#获得url指向的html内容
 imageList = getImageList(html)#获得所有图片的地址,返回列表
 download(imageList,page)#下载所有的图片
 page = page+1
 
if __name__ == '__main__':
 downImageNum(1)

下载本文
显示全文
专题