C#爬页面总结
2016-05-05 18:52
489 查看
错误的思路是这样的:发送一个访问页面的请求过去,得到一个html页面,然后我要的数据全都在这上面。后来发现不是这样的,也猜到可能是页面加载之后还有js代码的ajax异步加载,那么问题来了:我是不是要等到这些ajax请求结束之后,才能拿到数据呢?我怎么判断有没有结束?我要等多久合适呢?嗯,仔细想想,还有个问题是,POST请求发送过去之后,又没有浏览器渲染,谁去执行这些js代码呢?
实际上是这样的:发送一个访问页面的请求过去,上面可能有我要的数据,也可能没有,如果没有,那就看看是不是要发另外的请求。
1、相关工具和技巧
1.1 Chrome浏览器
打开浏览器,按F12进入调试状态,例如打开www.cnblogs.com
View Code
3 举个简单的例子
实际上是这样的:发送一个访问页面的请求过去,上面可能有我要的数据,也可能没有,如果没有,那就看看是不是要发另外的请求。
1、相关工具和技巧
1.1 Chrome浏览器
打开浏览器,按F12进入调试状态,例如打开www.cnblogs.com
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Reflection;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace Business
{
    /// <summary>
    /// Synchronous helper built on <see cref="HttpWebRequest"/> for issuing GET and POST requests.
    /// <see cref="Get(HttpRequestParams)"/> and <see cref="Post"/> do not throw on failure:
    /// any exception is written to the console and reported through
    /// <see cref="HttpResultInfo.ErrorMsg"/>.
    /// </summary>
    public class HttpCallHelper
    {
        /// <summary>
        /// Sends <see cref="HttpRequestParams.Data"/> as an
        /// <c>application/x-www-form-urlencoded</c> POST body to <see cref="HttpRequestParams.Url"/>.
        /// </summary>
        /// <param name="param">Request settings (URL, body, cookie, referer, timeout, encodings).</param>
        /// <returns>
        /// The response body, status and headers on success; otherwise a result whose
        /// <see cref="HttpResultInfo.ErrorMsg"/> holds the exception message.
        /// </returns>
        public static HttpResultInfo Post(HttpRequestParams param)
        {
            HttpResultInfo res = null;
            try
            {
                // Use the encoding chosen by the caller instead of overwriting it,
                // and treat a missing body as an empty one.
                byte[] bs = param.RequestEncoding.GetBytes(param.Data ?? string.Empty);

                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(param.Url);
                req.Method = "POST";
                req.ContentType = "application/x-www-form-urlencoded";
                req.ContentLength = bs.Length;

                // Apply the configured timeout, as Get does.
                req.Timeout = param.Timeout;

                if (!string.IsNullOrEmpty(param.Cookie))
                {
                    req.Headers[HttpRequestHeader.Cookie] = param.Cookie;
                }

                // The Referer header comes from the Referer property, not the cookie.
                if (!string.IsNullOrEmpty(param.Referer))
                {
                    req.Referer = param.Referer;
                }

                using (Stream reqStream = req.GetRequestStream())
                {
                    reqStream.Write(bs, 0, bs.Length);
                }

                // Dispose the response so the underlying connection is released.
                using (HttpWebResponse httpResponse = (HttpWebResponse)req.GetResponse())
                {
                    res = ReadResult(httpResponse, param.ResponseEncoding);
                }

                return res;
            }
            catch (Exception esx)
            {
                res = new HttpResultInfo()
                {
                    ErrorMsg = esx.Message.ToString(),
                };
                Console.WriteLine(esx.Message.ToString());
            }

            return res;
        }

        /// <summary>
        /// Builds a query string from <paramref name="param"/>, stores it in
        /// <see cref="HttpRequestParams.Data"/> and issues a GET request.
        /// </summary>
        /// <param name="httpParam">Request settings; its <c>Data</c> property is overwritten.</param>
        /// <param name="param">
        /// Query-string key/value pairs. They are written as-is (not URL-encoded),
        /// so callers must pass values that are already safe to place in a URL.
        /// </param>
        /// <returns>Same as <see cref="Get(HttpRequestParams)"/>.</returns>
        public static HttpResultInfo Get(HttpRequestParams httpParam, Dictionary<string, string> param)
        {
            // Reuse GetQueryString so no trailing separator is appended to the URL.
            httpParam.Data = GetQueryString(param);
            return Get(httpParam);
        }

        /// <summary>
        /// Issues a GET request to <see cref="HttpRequestParams.Url"/>, appending
        /// <see cref="HttpRequestParams.Data"/> as the query string when it is not empty.
        /// </summary>
        /// <param name="param">Request settings (URL, query data, cookie, timeout, content type, response encoding).</param>
        /// <returns>
        /// The response body, status and headers on success; otherwise a result with empty
        /// <c>Html</c> and <c>Cookie</c> and <see cref="HttpResultInfo.ErrorMsg"/> set to the exception message.
        /// </returns>
        public static HttpResultInfo Get(HttpRequestParams param)
        {
            HttpResultInfo ret = null;
            try
            {
                string urlStr = BuildGetUrl(param.Url, param.Data);

                HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(new Uri(urlStr));
                httpRequest.Timeout = param.Timeout;

                // HTTP method tokens are case-sensitive; send the canonical upper-case form.
                httpRequest.Method = "GET";
                httpRequest.ContentType = param.ContentType;
                if (!string.IsNullOrEmpty(param.Cookie))
                {
                    httpRequest.Headers[HttpRequestHeader.Cookie] = param.Cookie;
                }

                // Dispose the response so the underlying connection is released.
                using (HttpWebResponse httpResponse = (HttpWebResponse)httpRequest.GetResponse())
                {
                    ret = ReadResult(httpResponse, param.ResponseEncoding);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.ToString().ToString());
                ret = new HttpResultInfo()
                {
                    Html = string.Empty,
                    ErrorMsg = ex.Message.ToString(),
                    Cookie = string.Empty,
                };
            }

            return ret;
        }

        /// <summary>
        /// Joins the pairs of <paramref name="param"/> into <c>key=value</c> segments separated by
        /// an ampersand, without a trailing separator. Keys and values are not URL-encoded.
        /// </summary>
        /// <param name="param">Key/value pairs; <c>null</c> is treated as empty.</param>
        /// <returns>The query string without a leading question mark, or an empty string.</returns>
        public static string GetQueryString(Dictionary<string, string> param)
        {
            StringBuilder sb = new StringBuilder();
            if (param == null)
            {
                return sb.ToString();
            }

            foreach (KeyValuePair<string, string> item in param)
            {
                if (sb.Length > 0)
                {
                    sb.Append('&');
                }

                sb.AppendFormat("{0}={1}", item.Key, item.Value);
            }

            return sb.ToString();
        }

        /// <summary>
        /// Combines a base URL and query data, choosing the separator according to
        /// whether the URL already carries a query string.
        /// </summary>
        private static string BuildGetUrl(string url, string data)
        {
            if (string.IsNullOrEmpty(data))
            {
                return url;
            }

            string baseUrl = url ?? string.Empty;
            string separator;
            if (baseUrl.IndexOf('?') < 0)
            {
                separator = "?";
            }
            else if (baseUrl.EndsWith("?", StringComparison.Ordinal) || baseUrl.EndsWith("&", StringComparison.Ordinal))
            {
                // The URL is already open for more query parameters.
                separator = string.Empty;
            }
            else
            {
                separator = "&";
            }

            return baseUrl + separator + data;
        }

        /// <summary>
        /// Reads the whole response body with <paramref name="encoding"/> and copies the
        /// status, headers, Set-Cookie header and final URI into a result object.
        /// </summary>
        private static HttpResultInfo ReadResult(HttpWebResponse response, Encoding encoding)
        {
            string body;
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, encoding))
            {
                body = reader.ReadToEnd();
            }

            return new HttpResultInfo()
            {
                Cookie = response.Headers["Set-Cookie"],
                StatusCode = response.StatusCode,
                StatusDescription = response.StatusDescription,
                Headers = response.Headers,
                ErrorMsg = string.Empty,
                Html = body,
                ResponseUrl = response.ResponseUri,
            };
        }
    }

    /// <summary>
    /// Settings for a single HTTP request made through <see cref="HttpCallHelper"/>.
    /// </summary>
    public class HttpRequestParams
    {
        /// <summary>
        /// Encoding used to turn the request body into bytes.
        /// </summary>
        private Encoding requestEncoding = Encoding.Default;

        /// <summary>
        /// Encoding used to decode the response body.
        /// </summary>
        private Encoding responseEncoding = Encoding.Default;

        /// <summary>
        /// Request timeout in milliseconds (default 180 seconds).
        /// </summary>
        private int timeout = 180000;

        /// <summary>
        /// Content-Type header value (default text/html).
        /// </summary>
        private string contentType = "text/html";

        /// <summary>
        /// Creates request settings with the default encodings, timeout and content type.
        /// </summary>
        public HttpRequestParams()
        {
        }

        /// <summary>
        /// Target URL of the request.
        /// </summary>
        public string Url { get; set; }

        /// <summary>
        /// POST body, or the query string appended to the URL for GET.
        /// </summary>
        public string Data { get; set; }

        /// <summary>
        /// Value of the Cookie request header; the header is not sent when null or empty.
        /// </summary>
        public string Cookie { get; set; }

        /// <summary>
        /// Content-Type header value applied to GET requests (default text/html).
        /// </summary>
        public string ContentType
        {
            get { return this.contentType; }
            set { this.contentType = value; }
        }

        /// <summary>
        /// Value of the Referer request header applied to POST requests; not sent when null or empty.
        /// </summary>
        public string Referer { get; set; }

        /// <summary>
        /// Request timeout in milliseconds (default 180000).
        /// </summary>
        public int Timeout
        {
            get { return this.timeout; }
            set { this.timeout = value; }
        }

        /// <summary>
        /// Encoding used to turn the request body into bytes. Cannot be set to null.
        /// </summary>
        /// <exception cref="ArgumentNullException">The assigned value is null.</exception>
        public Encoding RequestEncoding
        {
            get
            {
                return this.requestEncoding;
            }

            set
            {
                if (value == null)
                {
                    throw new ArgumentNullException("value", "请求编码格式不能设置为空!");
                }

                this.requestEncoding = value;
            }
        }

        /// <summary>
        /// Encoding used to decode the response body. Cannot be set to null.
        /// </summary>
        /// <exception cref="ArgumentNullException">The assigned value is null.</exception>
        public Encoding ResponseEncoding
        {
            get
            {
                return this.responseEncoding;
            }

            set
            {
                if (value == null)
                {
                    throw new ArgumentNullException("value", "响应编码格式不能设置为空!");
                }

                this.responseEncoding = value;
            }
        }
    }

    /// <summary>
    /// Outcome of a request made through <see cref="HttpCallHelper"/>.
    /// </summary>
    public class HttpResultInfo
    {
        /// <summary>
        /// Response body decoded as text.
        /// </summary>
        public string Html { get; set; }

        /// <summary>
        /// Value of the Set-Cookie response header.
        /// </summary>
        public string Cookie { get; set; }

        /// <summary>
        /// HTTP status code of the response.
        /// </summary>
        public HttpStatusCode StatusCode { get; set; }

        /// <summary>
        /// Exception message when the request failed; empty on success.
        /// </summary>
        public string ErrorMsg { get; set; }

        /// <summary>
        /// Status description returned with the response.
        /// </summary>
        public string StatusDescription { get; set; }

        /// <summary>
        /// Response headers.
        /// </summary>
        public WebHeaderCollection Headers { get; set; }

        /// <summary>
        /// URI that actually answered the request.
        /// </summary>
        public Uri ResponseUrl { get; set; }
    }
}
View Code
3 举个简单的例子
相关文章推荐
- 登山-C#-加载Excel档案
- 关于在C#中对函数重载理解
- C# 实现16进制和字符串之间转换的代码[转]
- c# 获取方法所在的命名空间 类名 方法名
- C#操作符??和?:用法
- C#
- c# 加载xml
- C# ListView用法详解 很完整
- c# dataGridView导出数据到EXcel
- C#调用存储过程
- C# 6.0那些事(转)
- C#抽象类及其方法的学习
- C#实现 Eval
- LeetCode #12 Integer to Roman C# Solution
- C#静态代码检查工具StyleCode -- 自定义规则
- C#静态代码检查工具StyleCode -- 规则解析
- LeetCode #11 Container With Most Water C# Solution
- C#静态代码检查工具StyleCode -- 初探
- C#资深开发组 427769854
- .Net中C#的DllImport的用法