Fetching images from a web page with Python and saving them locally
2020-11-27 14:34:59 Editor: Xiaocai


In a previous article we shared PHP source code for batch-fetching remote web images and saving them locally; interested readers can click through for the details. This article does the same job in Python.

# -*- coding: utf-8 -*-
# Python 2 script: urllib2 and cookielib were merged into
# urllib.request and http.cookiejar in Python 3.
import os
import uuid
import urllib2
import cookielib

def get_file_extension(file_name):
    """Return the file extension, including the dot."""
    return os.path.splitext(file_name)[1]

def mkdir(path):
    """Create the directory if it does not exist, and return it."""
    # strip surrounding whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

def unique_str():
    """Generate a unique string with a fixed length of 36 (a UUID)."""
    return str(uuid.uuid1())

def get_file(url):
    """Fetch the contents of a URL into memory; return None on failure.
    @url  the file to fetch, path + filename
    """
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

def save_file(path, file_name, data):
    """Save a file locally.
    @path       local directory
    @file_name  file name
    @data       file contents
    """
    if data is None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    f = open(path + file_name, "wb")
    f.write(data)
    f.flush()
    f.close()

# get the file extension
print get_file_extension("123.jpg")
# create a directory and return it
#print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))

Fetching the images from a specified URL with Python and saving them locally

# *** encoding: utf-8 ***
# Python 2 script: httplib is http.client and urlparse is urllib.parse in Python 3.
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

def httpExists(url):
    """Judge whether a URL exists, via a HEAD request."""
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:    # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:                     # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

def gGetHtmlLines(url):
    """Get the HTML source of a URL; return a list of lines."""
    if url is None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> %s" % e
        return

def gGetHtml(url):
    """Get the HTML source of a URL; return a single string."""
    if url is None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> %s" % e
        return

def gGetFileName(url):
    """Get the file name from a URL."""
    if url is None: return None
    if url == "": return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

def gRandFilename(type):
    """Generate a random file name with the given extension."""
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))  # random uppercase letter
        fname = fname + chr(random.randint(48, 57))  # random digit
    return fname + '.' + type

def gGetAbslLink(url, link):
    """Given a page URL and a link found on it, return the link's absolute address."""
    if url is None or link is None: return
    if url == '' or link == '': return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

def gGetRegList(linesList, regx):
    """Match a regular expression against the given lines; return the unique captures as a list."""
    if linesList is None: return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs is not None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

def gDownloadWithFilename(url, savePath, file):
    """Download the file at a URL, saving it under the given file name."""
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        f = open(savePath + file, 'w+b')
        f.write(data)
        f.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>> %s" % e

def gDownload(url, savePath):
    """Download the file at a URL, deriving the file name from the URL."""
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

def gDownloadHtmlJpg(downloadUrl, savePath):
    """Download every jpg referenced on the given page."""
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)    # get the links matching the regular expression
    if lists is None: return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

def gGetHttpAddr(url):
    """Get the site root address from a URL."""
    if url == '': return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

def gGetHttpAddrFather(url):
    """Get the parent directory of a URL."""
    if url == '': return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

def gGetHttpAddrFatherAssign(url, link):
    """Given a URL and a '..'-style relative link on it, return the link's absolute address."""
    if url == '': return ''
    if link == '': return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    numOfFather = 0  # number of parent levels to climb
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

def gGetHtmlLink(url):
    """Collect the htm/html links on the given page; return them as a list."""
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

def gDownloadAllJpg(url, savePath):
    """Download the jpgs on the given page and on the htm pages it links to."""
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # download the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

def main():
    """test"""
    u = 'http://site.douban.com/196738/room/2462453/'  # page to fetch images from
    save = '/root/python/tmp/'                         # directory to save the images to
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "imported as a module."

The code above is everything you need to fetch the images from a web page with Python and save them locally. We hope you find it useful.
