How to crawl permanent links for Sogou WeChat official account articles with Python: the approach explained
2020-11-27 14:13:08


This article introduces an approach for crawling Sogou WeChat official account articles with Python and resolving their temporary links into permanent ones. It is shared here for your reference.

The emphasis is on the overall approach; you will need to adapt the code to your own environment.

  • Use Sogou WeChat search to find official accounts and their articles

  • Obtain permanent article links through the WeChat public platform (mp.weixin.qq.com)

  • Python plus the Scrapy framework

  • MySQL for storing and reading official account data (a storage sketch follows this list)
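
The article never shows the MySQL layer it mentions, so the following is a minimal sketch, assuming a single `accounts` table and local credentials; the table name, schema, and connection parameters are illustrative, not part of the original code.

    import pymysql

    # Assumed connection parameters and schema; adjust to your setup.
    conn = pymysql.connect(host="localhost", user="root",
                           password="your_password", database="weixin",
                           charset="utf8mb4")

    def save_account(name):
        # Store one official account name (hypothetical table `accounts`).
        with conn.cursor() as cur:
            cur.execute("INSERT INTO accounts (name) VALUES (%s)", (name,))
        conn.commit()

    def load_accounts():
        # Read back all stored official account names.
        with conn.cursor() as cur:
            cur.execute("SELECT name FROM accounts")
            return [row[0] for row in cur.fetchall()]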


  • Get the current day's ranked results from Sogou WeChat search

    Given an input keyword, crawl the matching official accounts with Scrapy

    Obtain cookie information by logging in to the WeChat public platform

    Since automated login to the platform has not been solved here, you must log in manually and capture the cookie in real time (a cookie-loading sketch follows)
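
To illustrate what "capture the cookie manually" can look like in practice: copy the Cookie header from the browser's developer tools after logging in to mp.weixin.qq.com, then parse it into a dict for requests. The values below are placeholders, not working credentials.

    import requests

    # Placeholder: paste the real Cookie header captured from the browser here.
    raw_cookie = "noticeLoginFlag=1; ua_id=XXXX; slave_sid=XXXX"

    # Split "k1=v1; k2=v2" into a dict that requests can send.
    cookies = dict(pair.split("=", 1) for pair in raw_cookie.split("; "))

    # Quick sanity check that the session is valid (the token is session-bound too).
    resp = requests.get("https://mp.weixin.qq.com/cgi-bin/searchbiz",
                        params={"action": "search_biz", "query": "test",
                                "begin": 0, "count": 5,
                                "lang": "zh_CN", "f": "json", "ajax": 1,
                                "token": "YOUR_TOKEN"},
                        cookies=cookies)
    print(resp.json())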




    This is where the temporary Sogou link can be converted into a permanent one.

    Code:
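
    The snippets below are methods of a Scrapy spider; the article omits the module-level imports they rely on, which would be roughly the following (the items import path is an assumption about the project layout):

    import json
    import re
    import time

    import requests
    import scrapy
    from lxml import etree

    from ..items import SougouItem  # assumed location of the item class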

    def parse(self, response):
        item = SougouItem()
        item["title"] = response.xpath('//title/text()').extract_first()
        print("**" * 5, item["title"], "**" * 5)
        # Read the search keyword from stdin; this blocks the spider until a
        # value is entered.
        name = input("---------- Enter the keyword to search for: ")
        print(name)
        url = "http://weixin.sogou.com/weixin?query=" + name + "&type=2&page=1&ie=utf8"
        yield scrapy.Request(url=url, callback=self.parse_two, meta={"name": name})
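
    The spider references SougouItem, but the article never defines it. A minimal definition consistent with the fields used below could look like this; the field list is inferred from the spider code, not taken from the original:

    import scrapy

    class SougouItem(scrapy.Item):
        # Fields inferred from how the spider populates the item below.
        title = scrapy.Field()       # <title> of the search results page
        name = scrapy.Field()        # the search keyword
        url = scrapy.Field()         # temporary Sogou article link
        tit = scrapy.Field()         # article title
        gongzhongh = scrapy.Field()  # official account name
        aid = scrapy.Field()
        appmsgid = scrapy.Field()
        cover = scrapy.Field()
        digest = scrapy.Field()
        url_link = scrapy.Field()    # permanent mp.weixin.qq.com link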

    When requests come too fast, Sogou WeChat redirects to a CAPTCHA page, so parse_two has to handle both cases:

    def parse_two(self, response):
        print(response.url)
        name = response.meta["name"]
        resp = response.xpath('//ul[@class="news-list"]/li')
        # Carry the current result page through meta so pagination advances.
        s = response.meta.get("page", 1)
        # When Sogou rate-limits us, it redirects to an anti-spider URL that
        # contains a "from" parameter.
        res = re.search("from", response.url)
        if res:
            # CAPTCHA required: download the image, ask the user to solve it,
            # then post the answer back to the anti-spider endpoint.
            print(response.url)
            img = response.xpath('//img/@src').extract()
            print(img)
            url_img = "http://weixin.sogou.com/antispider/" + img[1]
            print(url_img)
            url_img = requests.get(url_img).content
            with open("urli.jpg", "wb") as f:
                f.write(url_img)
            img = input("Enter the CAPTCHA: ")
            print(img)
            url = response.url
            r = re.search(r"from=(.*)", url).group(1)
            print(r)
            postData = {"c": img, "r": r, "v": "5"}
            url = "http://weixin.sogou.com/antispider/thank.php"
            yield scrapy.FormRequest(url=url, formdata=postData,
                                     callback=self.parse_two, meta={"name": name})
        else:
            # No CAPTCHA: walk the first nine results on this page.
            for res, i in zip(resp, range(1, 10)):
                item = SougouItem()
                item["url"] = res.xpath('.//p[1]/a/@href').extract_first()
                item["name"] = name
                print("Result %d" % i)
                # Convert the temporary link to a permanent one. The Cookie
                # below must be replaced with the one captured after manually
                # logging in to mp.weixin.qq.com; the token in the URLs is
                # bound to the same session.
                headers = {"Host": "mp.weixin.qq.com",
                           "Connection": "keep-alive",
                           "Accept": "application/json, text/javascript, */*; q=0.01",
                           "X-Requested-With": "XMLHttpRequest",
                           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
                           "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=9349250&lang=zh_CN",
                           "Accept-Encoding": "gzip, deflate, br",
                           "Accept-Language": "zh-CN,zh;q=0.9",
                           "Cookie": "noticeLoginFlag=1; pgv_pvi=5269297152; pt2gguin=o1349184918; RK=ph4smy/QWu; ptcz=f3eb6ede5db921d0ada7f1713e6d1ca516d200fec57d602e677245490fcb7f1e; pgv_pvid=1033302674; o_cookie=1349184918; pac_uid=1_1349184918; ua_id=4nooSvHNkTOjpIpgAAAAAFX9OSNcLApfsluzwfClLW8=; mm_lang=zh_CN; noticeLoginFlag=1; remember_acct=Liangkai318; rewardsn=; wxtokenkey=777; pgv_si=s1944231936; uuid=700c40c965347f0925a8e8fdcc1e003e; ticket=023fc8861356b01527983c2c4765ef80903bf3d7; ticket_id=gh_6923d82780e4; cert=L_cE4aRdaZeDnzao3xEbMkcP3Kwuejoi; data_bizuin=3075391054; bizuin=3208078327; data_ticket=XrzOnrV9Odc80hJLtk8vFjTLI1vd7kfKJ9u+DzvaeeHxZkMXbv9kcWk/Pmqx/9g7; slave_sid=SWRKNmFyZ1NkM002Rk9NR0RRVGY5VFdMd1lXSkExWGtPcWJaREkzQ1BESEcyQkNLVlQ3YnB4OFNoNmtRZzdFdGpnVGlHak9LMjJ5eXBNVEgxZDlZb1BZMnlfN1hKdnJsV0NKallsQW91Zjk5Y3prVjlQRDNGYUdGUWNFNEd6eTRYT1FSOEQxT0MwR01Ja0Vo; slave_user=gh_6923d82780e4; xid=7b2245140217dbb3c5c0a552d46b96; openid2ticket_oTr5Ot_B4nrDSj14zUxlXg8yrzws=D/B6//xK73BoO+mKE2EAjdcgIXNPw/b5PEDTDWM6t+4="}
                respon = requests.get(url=item["url"]).content
                gongzhongh = etree.HTML(respon).xpath('//a[@id="post-user"]/text()')[0]
                # times = etree.HTML(respon).xpath('//*[@id="post-date"]/text()')[0]
                title_one = etree.HTML(respon).xpath('//*[@id="activity-name"]/text()')[0].split()[0]
                print(gongzhongh, title_one)
                item["tit"] = title_one
                item["gongzhongh"] = gongzhongh
                # item["times"] = times
                # Look up the account's fakeid on the public platform.
                url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&token=9349250&lang=zh_CN&f=json&ajax=1&query=" + gongzhongh + "&begin=0&count=5"
                resp = requests.get(url=url, headers=headers).content
                print(resp)
                faskeids = json.loads(resp.decode("utf-8"))
                try:
                    list_fask = faskeids["list"]
                except Exception:
                    print("**********[INFO]: request failed, login invalid, please log in again*************")
                    return
                for fask in list_fask:
                    fakeid = fask["fakeid"]
                    nickname = fask["nickname"]
                    if nickname == item["gongzhongh"]:
                        # Query this account's article list for the title.
                        url = "https://mp.weixin.qq.com/cgi-bin/appmsg?token=9349250&f=json&ajax=1&action=list_ex&begin=0&count=5&query=" + item["tit"] + "&fakeid=" + fakeid + "&type=9"
                        resp = requests.get(url=url, headers=headers).content
                        # app_msg_list is a list of articles; with the title as
                        # the query, the first entry should be the match.
                        app = json.loads(resp.decode("utf-8"))["app_msg_list"][0]
                        item["aid"] = app["aid"]
                        item["appmsgid"] = app["appmsgid"]
                        item["cover"] = app["cover"]
                        item["digest"] = app["digest"]
                        item["url_link"] = app["link"]
                        item["tit"] = app["title"]
                        print(item)
                        time.sleep(10)
                        # Alternative: iterate the list and compare titles
                        # explicitly before accepting a link.
                        # dict_wengzhang = json.loads(resp.decode("utf-8"))
                        # app_msg_list = dict_wengzhang["app_msg_list"]
                        # print(len(app_msg_list))
                        # for app in app_msg_list:
                        #     print(app)
                        #     title = app["title"]
                        #     if title == item["tit"]:
                        #         item["url_link"] = app["link"]
                        #         updata_time = app["update_time"]
                        #         item["times"] = time.strftime("%Y-%m-%d %H:%M:%S", updata_time)
                        #         print("Final link:", item["url_link"])
                        #         yield item
                        #     else:
                        #         print(app["title"], item["tit"])
                        #         print("Title differs from the selected article, skipping")
                        # else:
                        #     print(nickname, item["gongzhongh"])
                        #     print("Account name differs from the selected one, skipping")
                        #     time.sleep(100)
                        # yield item
            # If the page has a "next page" link (<a class="np">), queue it.
            if response.xpath('//a[@class="np"]'):
                s += 1
                url = "http://weixin.sogou.com/weixin?query=" + name + "&type=2&page=" + str(s)
                yield scrapy.Request(url=url, callback=self.parse_two,
                                     meta={"name": name, "page": s})
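
    For completeness, one way to run the spider stand-alone is through Scrapy's CrawlerProcess. The spider class name and module path below are assumptions, since the original only shows the spider's methods:

    from scrapy.crawler import CrawlerProcess

    # Hypothetical module and class name; adjust to your project.
    from spiders.sogou import SougouSpider

    process = CrawlerProcess(settings={
        "DOWNLOAD_DELAY": 2,  # be gentle; Sogou rate-limits aggressively
    })
    process.crawl(SougouSpider)
    process.start()  # blocks until the crawl finishes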
