Building a Python Crawler to Scrape Girl Pictures
2020-11-27 14:17:41 Editor: 小采
As a thoughtful, cultured, upstanding young man of the new century (while mourning 慢播 and boycotting 百度), idling away time on YY, watching 斗鱼, and flipping through galleries of pretty girls is practically mandatory. The pictures are plentiful, but paging through them by hand is a chore. So today we'll build a crawler to pull them all down. There are two examples this time: the 妹子图 (girl pictures) on 煎蛋, and the rosi galleries from another site. I'm only a rookie learning Python; there is nothing shameful about the technique, and the technique itself is innocent!

煎蛋 (jandan.net):

First, the program flow: get the URL of a 煎蛋 girl-picture page, fetch the page source, extract the image addresses from it, then request each image address and save the image locally. Ready? Let's take a look at a 煎蛋 page first.

The URL we get is http://jandan.net/ooxx/page-17#comments, where 17 is the page number. First we find the newest page number, then walk through the pages from there, collecting the image URLs on each one. Now let's study the page source and write the regular expressions.
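To make the URL scheme concrete, here is a minimal sketch (Python 2, matching the rest of this article) of how a page URL can be assembled from the base address and a page index. The helper name build_page_url is made up for illustration and is not part of the original code.

# -*- coding: utf-8 -*-
# Sketch only: build the URL for a given page index of the jandan.net gallery.
BASE_URL = "http://jandan.net/ooxx/"

def build_page_url(page_index):
    # e.g. build_page_url(17) -> "http://jandan.net/ooxx/page-17#comments"
    return BASE_URL + "page-" + str(page_index) + "#comments"

if __name__ == '__main__':
    for i in range(1500, 1503):  # crawl a few pages starting from 1500
        print build_page_url(i)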

Using the approach from the earlier articles, we write the getNewPage function as follows:

def __getNewPage(self):
    pageCode = self.Get(self.__Url)
    type = sys.getfilesystemencoding()
    # The original pattern was mangled when this article was reposted. The intent
    # is to capture the newest page number, which 煎蛋 shows in the page header as
    # something like [2056]; the exact HTML below is a reconstruction, not verbatim.
    pattern = re.compile(r'current-comment-page">\[(\d+)\]', re.S)
    newPage = re.search(pattern, pageCode.decode("UTF-8").encode(type))
    print pageCode.decode("UTF-8").encode(type)  # debug: dump the fetched page
    if newPage != None:
        return newPage.group(1)
    return 1500

Don't ask why it returns 1500 on failure: 煎蛋 has already swallowed every page before 1500. You could return 0 instead if you like. Next comes the function that collects the image URLs on a page:

def __getAllPicUrl(self, pageIndex):
    realurl = self.__Url + "page-" + str(pageIndex) + "#comments"
    pageCode = self.Get(realurl)
    type = sys.getfilesystemencoding()
    # The pattern lost its HTML when the article was reposted; the idea is to pull
    # the src of each image in the comment list. The tag structure below is an
    # approximation of the original, not verbatim.
    pattern = re.compile('<li id="comment-.*?<img src="(.*?)".*?</li>', re.S)
    items = re.findall(pattern, pageCode.decode("UTF-8").encode(type))
    for item in items:
        print item

Good. Now that we have the image addresses, the remaining step is to request each address and save the image:

def __savePics(self, img_addr, folder):
    # note: folder is not used here; start() chdirs into it before calling this
    for item in img_addr:
        filename = item.split('/')[-1]
        print "Saving image: " + filename
        with open(filename, 'wb') as file:
            img = self.Get(item)
            file.write(img)

Just when you're feeling confident, a bucket of cold water lands on your head; that's how programming goes, testing your patience and grinding down your confidence. After testing for a while you discover that, once the program is restarted, it can no longer fetch the newest page number, even though you changed nothing. Don't panic; print out the page source we got back and take a look:

See that? The server decided you don't look like a browser and blocked your IP. So how do we fix it? Answer: change IP, i.e. find a proxy. Next we modify HttpClient.py and configure a proxy server on the opener. Search for "HTTP proxy" to find proxy servers yourself; a usable one isn't easy to find, so try them one by one in IE's Internet Options and check the speed.
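Before wiring a proxy into HttpClient, it can save time to sanity-check it in isolation. The snippet below is only a rough sketch (Python 2 / urllib2, like the rest of the article); proxy_works is a made-up helper name, and the proxy address is the article's example, which may well be dead by now.

# -*- coding: utf-8 -*-
# Sketch: quickly test whether an HTTP proxy can reach the target site.
import urllib2, socket

def proxy_works(proxy, test_url="http://jandan.net/ooxx/", timeout=30):
    opener = urllib2.build_opener(urllib2.ProxyHandler({"http": proxy}))
    try:
        opener.open(test_url, timeout=timeout)
        return True
    except (urllib2.URLError, socket.timeout, socket.error):
        return False

if __name__ == '__main__':
    print proxy_works("42.121.6.80:8080")  # the proxy used in this article; likely stale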

# -*- coding: utf-8 -*-
import cookielib, urllib, urllib2, socket
import zlib, StringIO

class HttpClient:
    __cookie = cookielib.CookieJar()
    # set the proxy server and port
    __proxy_handler = urllib2.ProxyHandler({"http": '42.121.6.80:8080'})
    # build the opener with cookie support and the proxy handler
    __req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie), __proxy_handler)
    __req.addheaders = [
        ('Accept', 'application/javascript, */*;q=0.8'),
        ('User-Agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW; Trident/5.0)')
    ]
    urllib2.install_opener(__req)

    def Get(self, url, refer=None):
        try:
            req = urllib2.Request(url)
            #req.add_header('Accept-encoding', 'gzip')
            if not (refer is None):
                req.add_header('Referer', refer)
            response = urllib2.urlopen(req, timeout=120)
            html = response.read()
            #gzipped = response.headers.get('Content-Encoding')
            #if gzipped:
            #    html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
            return html
        except urllib2.HTTPError, e:
            return e.read()
        except socket.timeout, e:
            return ''
        except socket.error, e:
            return ''

After that, you can browse the images quite happily, although with a proxy the speed is painful. Set the timeout a little longer so the images still manage to download.
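If the proxy is slow, a simple retry loop on top of HttpClient.Get can also help. The sketch below is an illustration only: get_with_retry and its retry count are invented here, not part of the original code, and it relies on Get() returning an empty string when the socket times out.

# -*- coding: utf-8 -*-
# Sketch: retry a fetch a few times before giving up on a slow proxy.
from HttpClient import HttpClient

def get_with_retry(client, url, retries=3):
    for attempt in range(retries):
        data = client.Get(url)
        if data:  # Get() returns '' on socket timeout
            return data
        print "retry %d for %s" % (attempt + 1, url)
    return ''

if __name__ == '__main__':
    client = HttpClient()
    html = get_with_retry(client, "http://jandan.net/ooxx/")
    print len(html)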

The rosi part will be covered in the next article. For now, time to post the full code:

# -*- coding: utf-8 -*-
import cookielib, urllib, urllib2, socket
import zlib, StringIO

class HttpClient:
    __cookie = cookielib.CookieJar()
    __proxy_handler = urllib2.ProxyHandler({"http": '42.121.6.80:8080'})
    __req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie), __proxy_handler)
    __req.addheaders = [
        ('Accept', 'application/javascript, */*;q=0.8'),
        ('User-Agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW; Trident/5.0)')
    ]
    urllib2.install_opener(__req)

    def Get(self, url, refer=None):
        try:
            req = urllib2.Request(url)
            req.add_header('Accept-encoding', 'gzip')
            if not (refer is None):
                req.add_header('Referer', refer)
            response = urllib2.urlopen(req, timeout=120)
            html = response.read()
            gzipped = response.headers.get('Content-Encoding')
            if gzipped:
                html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
            return html
        except urllib2.HTTPError, e:
            return e.read()
        except socket.timeout, e:
            return ''
        except socket.error, e:
            return ''

    def Post(self, url, data, refer=None):
        try:
            #req = urllib2.Request(url, urllib.urlencode(data))
            req = urllib2.Request(url, data)
            if not (refer is None):
                req.add_header('Referer', refer)
            return urllib2.urlopen(req, timeout=120).read()
        except urllib2.HTTPError, e:
            return e.read()
        except socket.timeout, e:
            return ''
        except socket.error, e:
            return ''

    def Download(self, url, file):
        output = open(file, 'wb')
        output.write(urllib2.urlopen(url).read())
        output.close()

#    def urlencode(self, data):
#        return urllib.quote(data)

    def getCookie(self, key):
        for c in self.__cookie:
            if c.name == key:
                return c.value
        return ''

    def setCookie(self, key, val, domain):
        ck = cookielib.Cookie(version=0, name=key, value=val, port=None, port_specified=False, domain=domain, domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)
        self.__cookie.set_cookie(ck)
        #self.__cookie.clear()  # clean cookies

# vim : tabstop=2 shiftwidth=2 softtabstop=2 expandtab

HttpClient.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from HttpClient import HttpClient
import sys, re, os

class JianDan(HttpClient):
    def __init__(self):
        self.__pageIndex = 1500  # everything before page 1500 was swallowed by 煎蛋
        self.__Url = "http://jandan.net/ooxx/"
        self.__floder = "jiandan"

    def __getAllPicUrl(self, pageIndex):
        realurl = self.__Url + "page-" + str(pageIndex) + "#comments"
        pageCode = self.Get(realurl)
        type = sys.getfilesystemencoding()
        # reconstructed pattern (the original HTML was stripped when the article
        # was reposted): grab the src of each image in the comment list
        pattern = re.compile('<li id="comment-.*?<img src="(.*?)".*?</li>', re.S)
        items = re.findall(pattern, pageCode.decode("UTF-8").encode(type))
        for item in items:
            print item
        self.__savePics(items, self.__floder)

    def __savePics(self, img_addr, folder):
        for item in img_addr:
            filename = item.split('/')[-1]
            print "Saving image: " + filename
            with open(filename, 'wb') as file:
                img = self.Get(item)
                file.write(img)

    def __getNewPage(self):
        pageCode = self.Get(self.__Url)
        type = sys.getfilesystemencoding()
        # reconstructed pattern: capture the newest page number, shown as [NNNN]
        pattern = re.compile(r'current-comment-page">\[(\d+)\]', re.S)
        newPage = re.search(pattern, pageCode.decode("UTF-8").encode(type))
        print pageCode.decode("UTF-8").encode(type)
        if newPage != None:
            return newPage.group(1)
        return 1500

    def start(self):
        isExists = os.path.exists(self.__floder)  # check whether the download folder exists
        print isExists
        if not isExists:
            os.mkdir(self.__floder)
        os.chdir(self.__floder)
        page = int(self.__getNewPage())
        for i in range(self.__pageIndex, page):
            self.__getAllPicUrl(i)

if __name__ == '__main__':
    jd = JianDan()
    jd.start()

JianDan.py
