视频1 视频21 视频41 视频61 视频文章1 视频文章21 视频文章41 视频文章61 推荐1 推荐3 推荐5 推荐7 推荐9 推荐11 推荐13 推荐15 推荐17 推荐19 推荐21 推荐23 推荐25 推荐27 推荐29 推荐31 推荐33 推荐35 推荐37 推荐39 推荐41 推荐43 推荐45 推荐47 推荐49 关键词1 关键词101 关键词201 关键词301 关键词401 关键词501 关键词601 关键词701 关键词801 关键词901 关键词1001 关键词1101 关键词1201 关键词1301 关键词1401 关键词1501 关键词1601 关键词1701 关键词1801 关键词1901 视频扩展1 视频扩展6 视频扩展11 视频扩展16 文章1 文章201 文章401 文章601 文章801 文章1001 资讯1 资讯501 资讯1001 资讯1501 标签1 标签501 标签1001 关键词1 关键词501 关键词1001 关键词1501 专题2001
asp.net 抓取网页源码三种实现方法
2020-11-27 22:41:00 责编:小采
文档


方法1 比较推荐  

/// <summary>
/// Downloads the page at <paramref name="url"/> with <see cref="HttpWebRequest"/> and returns its source text.
/// The body is decoded with <see cref="Encoding.Default"/>; if the stream starts with a byte order mark,
/// <see cref="StreamReader"/> switches to the encoding that mark identifies instead.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The response body as text.</returns>
public static string GetHtmlSource2(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.Accept = "*/*"; // accept any content type
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)"; // present ourselves as Internet Explorer
    request.AllowAutoRedirect = true; // follow redirects such as 302
    request.Referer = url; // report the page itself as the referrer
    // No CookieContainer is assigned; set request.CookieContainer if the target site needs cookies.

    // Dispose the response, stream and reader even when reading throws,
    // so the underlying connection is always released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}

方法2 

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.IO;
using System.Text;
using System.Net;

namespace MySql
{
    /// <summary>
    /// Helper for downloading the text of a web resource.
    /// </summary>
    public class GetHttpData
    {
        /// <summary>
        /// Requests <paramref name="Url"/> and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="Url">Address of the resource to download.</param>
        /// <returns>
        /// The response body, or <c>null</c> when obtaining the response fails
        /// (the failure is written to the trace listeners).
        /// </returns>
        public static string GetHttpData2(string Url)
        {
            WebRequest request = WebRequest.Create(Url);
            request.Timeout = 50000; // milliseconds

            WebResponse response;
            try
            {
                response = request.GetResponse();
            }
            catch (Exception e)
            {
                // Callers rely on null meaning "could not get a response"; keep that
                // contract but surface the reason instead of discarding it.
                System.Diagnostics.Trace.TraceWarning("GetHttpData2 failed for {0}: {1}", Url, e);
                return null;
            }

            // Read outside of any finally block, and dispose the response and the
            // reader even when reading throws.
            using (response)
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }
}

方法3

/// <summary>
/// Downloads <paramref name="url"/> with <see cref="WebClient"/> and decodes the bytes to text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSets">
/// Optional: exactly one encoding name for the target page. When it is omitted, <c>null</c>
/// or empty, the encoding is taken from the page's <c>&lt;meta ... charset=...&gt;</c>
/// declaration, falling back to UTF-8 when none is found.
/// </param>
/// <returns>The decoded page text, or an empty string when the download or decoding fails.</returns>
public static string getHtml(string url, params string[] charSets)
{
    try
    {
        // charSets itself is null when a caller passes an explicit null argument.
        string charSet = null;
        if (charSets != null && charSets.Length == 1)
        {
            charSet = charSets[0];
        }

        // Some pages may still fail to download (for example when they need a cookie);
        // handle such cases by adding headers, e.g. Headers.Add("Cookie", cookie).
        byte[] myDataBuffer;
        using (WebClient myWebClient = new WebClient())
        {
            // Credentials used if the server asks for authentication. For an explicit
            // user name and password assign a NetworkCredential instead.
            myWebClient.Credentials = CredentialCache.DefaultCredentials;
            myDataBuffer = myWebClient.DownloadData(url);
        }

        if (string.IsNullOrEmpty(charSet))
        {
            // The meta declaration is plain ASCII, so an ASCII decode is enough to find it.
            string sniffText = Encoding.ASCII.GetString(myDataBuffer);
            // Capture only the encoding name, so both charset=utf-8" and charset="utf-8"
            // forms work and trailing attributes are not swallowed into the name.
            Match charSetMatch = Regex.Match(sniffText, "<meta[^>]*?charset\\s*=\\s*[\"']?([\\w.:-]+)", RegexOptions.IgnoreCase);
            if (charSetMatch.Success)
            {
                charSet = charSetMatch.Groups[1].Value;
            }
        }

        // Decode with the requested or declared encoding; UTF-8 only when none is known.
        Encoding encoding = string.IsNullOrEmpty(charSet) ? Encoding.UTF8 : Encoding.GetEncoding(charSet);
        return encoding.GetString(myDataBuffer);
    }
    catch (Exception e)
    {
        // Best-effort contract: callers get "" on any failure. Record the reason
        // rather than dropping it silently.
        System.Diagnostics.Trace.TraceWarning("getHtml failed for {0}: {1}", url, e);
        return "";
    }
}

asp.net 获取网页源文件的方法

有时候我们需要获取网页源文件,用下面这个方法就能很容易地完成任务!

/// <summary>
/// Downloads the resource at <paramref name="strUrl"/> and returns its body decoded as gb2312.
/// </summary>
/// <param name="strUrl">Address of the page to download.</param>
/// <returns>The response body as text.</returns>
private string GetStringByUrl(string strUrl)
{
    WebRequest wrt = WebRequest.Create(strUrl);

    // Dispose the response, stream and reader so the connection is released
    // even when reading throws.
    using (WebResponse wrse = wrt.GetResponse())
    using (Stream strM = wrse.GetResponseStream())
    using (StreamReader SR = new StreamReader(strM, Encoding.GetEncoding("gb2312")))
    {
        return SR.ReadToEnd();
    }
}

只要传入要下载网页的地址就OK了!
通过这个方法做个源码导出:

/// <summary>
/// Renders "Default2.aspx" via RenderPage and sends the result to the client as a
/// UTF-8 encoded download named index.html.
/// </summary>
/// <returns>
/// The rendered HTML. NOTE(review): in classic ASP.NET Response.End() normally aborts the
/// request thread, so the return statement mainly satisfies the declared return type.
/// </returns>
private string SaveHTML()
{
    string str = RenderPage("Default2.aspx");
    System.Text.Encoding utf8 = System.Text.Encoding.GetEncoding("UTF-8");

    Response.ContentEncoding = utf8; // avoids garbled non-ASCII text in the output
    Response.AddHeader("Content-Disposition", "attachment;filename=index.html"); // ask the browser to save the body as a file
    // Content-Length counts bytes, not UTF-16 characters; using str.Length would
    // truncate the download whenever the page contains non-ASCII text.
    Response.AddHeader("Content-length", utf8.GetByteCount(str).ToString());
    Response.Write(str);
    Response.End();
    return str;
}

下载本文
显示全文
专题