Fetching images from a web page with Python and saving them locally
2020-11-27 14:34:59 Editor: Xiaocai


In a previous article we shared PHP source code for batch-fetching remote web images and saving them locally; interested readers can click through for the details. This article does the same job in Python.

# -*- coding: utf-8 -*-
# Python 2 script: urllib2 and cookielib were merged into
# urllib.request and http.cookiejar in Python 3.
import os
import uuid
import urllib2
import cookielib

def get_file_extension(file_name):
    """Return the file extension, including the dot."""
    return os.path.splitext(file_name)[1]

def mkdir(path):
    """Create the directory if it does not exist, and return it."""
    # strip surrounding whitespace
    path = path.strip()
    # strip a trailing backslash
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

def unique_str():
    """Generate a unique string with a fixed length of 36 (a UUID)."""
    return str(uuid.uuid1())

def get_file(url):
    """Fetch the contents of a URL into memory; return None on failure.
    @url  the file to fetch, path + filename
    """
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

def save_file(path, file_name, data):
    """Save a file locally.
    @path       local directory
    @file_name  file name
    @data       file contents
    """
    if data is None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    f = open(path + file_name, "wb")
    f.write(data)
    f.flush()
    f.close()

# get the file extension
print get_file_extension("123.jpg")
# create a directory and return it
#print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))

Fetching the images from a specified URL with Python and saving them locally

# *** encoding: utf-8 ***
# Python 2 script: httplib is http.client and urlparse is urllib.parse in Python 3.
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

def httpExists(url):
    """Judge whether a URL exists, via a HEAD request."""
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:    # normal 'found' status
            found = True
        elif resp.status == 302:  # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:                     # everything else -> not found
            print "Status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

def gGetHtmlLines(url):
    """Get the HTML source of a URL; return a list of lines."""
    if url is None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! Exception ==>> %s" % e
        return

def gGetHtml(url):
    """Get the HTML source of a URL; return a single string."""
    if url is None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! Exception ==>> %s" % e
        return

def gGetFileName(url):
    """Get the file name from a URL."""
    if url is None: return None
    if url == "": return ""
    arr = url.split("/")
    return arr[len(arr) - 1]

def gRandFilename(type):
    """Generate a random file name with the given extension."""
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))  # random uppercase letter
        fname = fname + chr(random.randint(48, 57))  # random digit
    return fname + '.' + type

def gGetAbslLink(url, link):
    """Given a page URL and a link found on it, return the link's absolute address."""
    if url is None or link is None: return
    if url == '' or link == '': return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

def gGetRegList(linesList, regx):
    """Match a regular expression against the given lines; return the unique captures as a list."""
    if linesList is None: return
    rtnList = []
    for line in linesList:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs is not None:
            allGroups = matchs.groups()
            for foundStr in allGroups:
                if foundStr not in rtnList:
                    rtnList.append(foundStr)
    return rtnList

def gDownloadWithFilename(url, savePath, file):
    """Download the file at a URL, saving it under the given file name."""
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        f = open(savePath + file, 'w+b')
        f.write(data)
        f.close()
    except IOError, error:
        print "DOWNLOAD %s ERROR!==>>%s" % (url, error)
    except Exception, e:
        print "Exception==>> %s" % e

def gDownload(url, savePath):
    """Download the file at a URL, deriving the file name from the URL."""
    # parameter checks omitted for now
    fileName = gGetFileName(url)
    #fileName = gRandFilename('jpg')
    gDownloadWithFilename(url, savePath, fileName)

def gDownloadHtmlJpg(downloadUrl, savePath):
    """Download every jpg referenced on the given page."""
    lines = gGetHtmlLines(downloadUrl)  # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx)    # get the links matching the regular expression
    if lists is None: return
    for jpg in lists:
        jpg = gGetAbslLink(downloadUrl, jpg) + '.jpg'
        gDownload(jpg, savePath)
        print gGetFileName(jpg)

def gGetHttpAddr(url):
    """Get the site root address from a URL."""
    if url == '': return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

def gGetHttpAddrFather(url):
    """Get the parent directory of a URL."""
    if url == '': return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

def gGetHttpAddrFatherAssign(url, link):
    """Given a URL and a '..'-style relative link on it, return the link's absolute address."""
    if url == '': return ''
    if link == '': return ''
    linkArray = link.split("/")
    urlArray = url.split("/")
    partLink = ''
    partUrl = ''
    numOfFather = 0  # number of parent levels to climb
    for i in range(len(linkArray)):
        if linkArray[i] == '..':
            numOfFather = i + 1
        else:
            partLink = partLink + '/' + linkArray[i]
    for i in range(len(urlArray) - 1 - numOfFather):
        partUrl = partUrl + urlArray[i]
        if i < len(urlArray) - 1 - numOfFather - 1:
            partUrl = partUrl + '/'
    return partUrl + partLink

def gGetHtmlLink(url):
    """Collect the htm/html links on the given page; return them as a list."""
    # parameter checks omitted for now
    rtnList = []
    lines = gGetHtmlLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in gGetRegList(lines, regx):
        link = gGetAbslLink(url, link) + '.htm'
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList

def gDownloadAllJpg(url, savePath):
    """Download the jpgs on the given page and on the htm pages it links to."""
    # parameter checks omitted for now
    gDownloadHtmlJpg(url, savePath)
    # download the jpgs on the linked pages
    links = gGetHtmlLink(url)
    for link in links:
        gDownloadHtmlJpg(link, savePath)

def main():
    """test"""
    u = 'http://site.douban.com/196738/room/2462453/'  # page to fetch images from
    save = '/root/python/tmp/'                         # directory to save the images to
    print 'download pic from [' + u + ']'
    print 'save to [' + save + '] ...'
    gDownloadHtmlJpg(u, save)
    print "download finished"

if __name__ == "__main__":
    main()
else:
    print "imported as a module."

The code above is everything you need to fetch the images from a web page with Python and save them locally. We hope you find it useful.
