您的位置:首页 > 编程语言 > C#

C#爬页面总结

2016-05-05 18:52 489 查看
错误的思路是这样的:发送一个访问页面的请求过去,得到一个html页面,然后我要的数据全都在这上面。后来发现不是这样的,也猜到可能是页面加载之后还有js代码的ajax异步加载,那么问题来了:我是不是要等到这些ajax请求结束之后,才能拿到数据呢?我怎么判断有没有结束?我要等多久合适呢?嗯,仔细想想,还有个问题是,发送post请求过去,又没有浏览器渲染,谁去执行这些js代码呢?

实际上是这样的:发送一个访问页面的请求过去,上面可能有我要的数据,也可能没有,如果没有,那就看看是不是要发另外的请求。

1、相关工具和技巧

1.1 Chrome浏览器
打开浏览器,按F12进入调试状态,例如打开www.cnblogs.com
 

using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Reflection;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace Business
{
/// <summary>
/// Static helpers that issue blocking HTTP GET/POST requests through
/// <see cref="HttpWebRequest"/> and report the outcome as an <see cref="HttpResultInfo"/>.
/// Request failures are returned through <see cref="HttpResultInfo.ErrorMsg"/> instead of being thrown.
/// </summary>
public class HttpCallHelper
{
    /// <summary>
    /// Sends <c>param.Data</c> as an <c>application/x-www-form-urlencoded</c> POST body to <c>param.Url</c>.
    /// </summary>
    /// <param name="param">
    /// Request description. <c>Data</c> is encoded with <c>RequestEncoding</c> (a null body is sent as
    /// an empty one); <c>Cookie</c>, <c>Referer</c> and <c>Timeout</c> are applied to the request, and
    /// the response body is decoded with <c>ResponseEncoding</c>.
    /// </param>
    /// <returns>
    /// The response details on success. On failure, a result whose <c>ErrorMsg</c> holds the exception
    /// message and whose <c>Html</c> and <c>Cookie</c> are empty strings.
    /// </returns>
    public static HttpResultInfo Post(HttpRequestParams param)
    {
        try
        {
            // Honour the encoding chosen by the caller instead of overwriting it on the caller's object.
            byte[] body = param.RequestEncoding.GetBytes(param.Data ?? string.Empty);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(param.Url);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = body.Length;
            request.Timeout = param.Timeout;
            ApplyCommonHeaders(request, param);

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }

            return ReadResult(request, param.ResponseEncoding);
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
            return CreateErrorResult(ex);
        }
    }

    /// <summary>
    /// Issues a GET request whose query string is built from <paramref name="param"/>.
    /// </summary>
    /// <param name="httpParam">
    /// Request description. Its <c>Data</c> property is overwritten with the generated query string.
    /// </param>
    /// <param name="param">
    /// Query-string pairs; null or empty means no query string. Keys and values are not URL-encoded
    /// here, so callers must pass them already encoded.
    /// </param>
    /// <returns>Same contract as <see cref="Get(HttpRequestParams)"/>.</returns>
    public static HttpResultInfo Get(HttpRequestParams httpParam, Dictionary<string, string> param)
    {
        // Reuse the shared builder so no trailing separator is left on the URL.
        httpParam.Data = GetQueryString(param);
        return Get(httpParam);
    }

    /// <summary>
    /// Issues a GET request to <c>param.Url</c>, appending <c>param.Data</c> as the query string when it is non-empty.
    /// </summary>
    /// <param name="param">
    /// Request description. <c>Timeout</c>, <c>ContentType</c>, <c>Cookie</c> and <c>Referer</c> are
    /// applied to the request, and the response body is decoded with <c>ResponseEncoding</c>.
    /// </param>
    /// <returns>
    /// The response details on success. On failure, a result whose <c>ErrorMsg</c> holds the exception
    /// message and whose <c>Html</c> and <c>Cookie</c> are empty strings.
    /// </returns>
    public static HttpResultInfo Get(HttpRequestParams param)
    {
        try
        {
            string url = BuildUrl(param.Url, param.Data);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(url));
            request.Timeout = param.Timeout;
            request.Method = "GET";
            request.ContentType = param.ContentType;
            ApplyCommonHeaders(request, param);

            return ReadResult(request, param.ResponseEncoding);
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.ToString());
            return CreateErrorResult(ex);
        }
    }

    /// <summary>
    /// Joins the pairs as <c>key=value</c> items separated by an ampersand, with no trailing separator.
    /// </summary>
    /// <param name="param">Pairs to join. Keys and values are emitted as-is (no URL-encoding).</param>
    /// <returns>The joined string, or an empty string when <paramref name="param"/> is null or empty.</returns>
    public static string GetQueryString(Dictionary<string, string> param)
    {
        if (param == null || param.Count == 0)
        {
            return string.Empty;
        }

        StringBuilder query = new StringBuilder();
        foreach (KeyValuePair<string, string> pair in param)
        {
            if (query.Length > 0)
            {
                query.Append('&');
            }

            query.Append(pair.Key).Append('=').Append(pair.Value);
        }

        return query.ToString();
    }

    /// <summary>
    /// Copies the optional Cookie and Referer values from <paramref name="param"/> onto the request.
    /// </summary>
    private static void ApplyCommonHeaders(HttpWebRequest request, HttpRequestParams param)
    {
        if (!string.IsNullOrEmpty(param.Cookie))
        {
            request.Headers[HttpRequestHeader.Cookie] = param.Cookie;
        }

        if (!string.IsNullOrEmpty(param.Referer))
        {
            request.Referer = param.Referer;
        }
    }

    /// <summary>
    /// Appends <paramref name="query"/> to <paramref name="url"/> with the separator the URL needs.
    /// </summary>
    private static string BuildUrl(string url, string query)
    {
        if (string.IsNullOrEmpty(query))
        {
            return url;
        }

        if (url.IndexOf('?') < 0)
        {
            return url + "?" + query;
        }

        // The URL already carries a query part: extend it instead of adding a second '?'.
        bool endsWithSeparator = url.EndsWith("?", StringComparison.Ordinal)
            || url.EndsWith("&", StringComparison.Ordinal);
        return endsWithSeparator ? url + query : url + "&" + query;
    }

    /// <summary>
    /// Executes the request, reads the whole body and captures the response metadata.
    /// The response is disposed before returning.
    /// </summary>
    private static HttpResultInfo ReadResult(HttpWebRequest request, Encoding responseEncoding)
    {
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            string body;
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, responseEncoding))
            {
                body = reader.ReadToEnd();
            }

            // Every value is read while the response is still open.
            return new HttpResultInfo()
            {
                Cookie = response.Headers["Set-Cookie"],
                StatusCode = response.StatusCode,
                StatusDescription = response.StatusDescription,
                Headers = response.Headers,
                ErrorMsg = string.Empty,
                Html = body,
                ResponseUrl = response.ResponseUri,
            };
        }
    }

    /// <summary>
    /// Builds the failure result for <paramref name="ex"/>. When the exception is a
    /// <see cref="WebException"/> carrying an HTTP response, its status, headers and URL are copied
    /// into the result and that response is disposed.
    /// </summary>
    private static HttpResultInfo CreateErrorResult(Exception ex)
    {
        HttpResultInfo result = new HttpResultInfo()
        {
            Html = string.Empty,
            ErrorMsg = ex.Message,
            Cookie = string.Empty,
        };

        WebException webException = ex as WebException;
        if (webException != null && webException.Response != null)
        {
            using (WebResponse rawResponse = webException.Response)
            {
                HttpWebResponse errorResponse = rawResponse as HttpWebResponse;
                if (errorResponse != null)
                {
                    result.StatusCode = errorResponse.StatusCode;
                    result.StatusDescription = errorResponse.StatusDescription;
                    result.Headers = errorResponse.Headers;
                    result.ResponseUrl = errorResponse.ResponseUri;
                }
            }
        }

        return result;
    }
}
/// <summary>
/// Describes one HTTP request: target URL, payload, header values, timeout and text encodings.
/// </summary>
public class HttpRequestParams
{
    /// <summary>
    /// Encoding for the request payload; starts as <see cref="Encoding.Default"/>.
    /// </summary>
    private Encoding requestEncoding = Encoding.Default;

    /// <summary>
    /// Encoding for the response body; starts as <see cref="Encoding.Default"/>.
    /// </summary>
    private Encoding responseEncoding = Encoding.Default;

    /// <summary>
    /// Request timeout in milliseconds; starts at 180000 (180 seconds).
    /// </summary>
    private int timeout = 180000;

    /// <summary>
    /// Content-Type value for the request; starts as <c>text/html</c>.
    /// </summary>
    private string contentType = "text/html";

    /// <summary>
    /// Creates a request description with the default encodings, timeout and content type.
    /// </summary>
    public HttpRequestParams()
    {
    }

    /// <summary>
    /// Address the request is sent to.
    /// </summary>
    public string Url { get; set; }

    /// <summary>
    /// Request payload as text.
    /// </summary>
    public string Data { get; set; }

    /// <summary>
    /// Cookie header value to send; null or empty means no cookie.
    /// </summary>
    public string Cookie { get; set; }

    /// <summary>
    /// Content-Type value for the request (default <c>text/html</c>).
    /// </summary>
    public string ContentType
    {
        get { return this.contentType; }
        set { this.contentType = value; }
    }

    /// <summary>
    /// Value intended for the Referer request header.
    /// </summary>
    public string Referer { get; set; }

    /// <summary>
    /// Request timeout in milliseconds (default 180000).
    /// </summary>
    public int Timeout
    {
        get { return this.timeout; }
        set { this.timeout = value; }
    }

    /// <summary>
    /// Encoding for the request payload. Never null.
    /// </summary>
    /// <exception cref="ArgumentNullException">The assigned value is null; the current encoding is kept.</exception>
    public Encoding RequestEncoding
    {
        get
        {
            return this.requestEncoding;
        }

        set
        {
            if (value == null)
            {
                // A specific exception type; callers that catch Exception are unaffected.
                throw new ArgumentNullException("value", "请求编码格式不能设置为空!");
            }

            this.requestEncoding = value;
        }
    }

    /// <summary>
    /// Encoding for the response body. Never null.
    /// </summary>
    /// <exception cref="ArgumentNullException">The assigned value is null; the current encoding is kept.</exception>
    public Encoding ResponseEncoding
    {
        get
        {
            return this.responseEncoding;
        }

        set
        {
            if (value == null)
            {
                // A specific exception type; callers that catch Exception are unaffected.
                throw new ArgumentNullException("value", "响应编码格式不能设置为空!");
            }

            this.responseEncoding = value;
        }
    }
}
/// <summary>
/// Outcome of an HTTP call: the response body and metadata, plus an error message.
/// </summary>
public class HttpResultInfo
{
    /// <summary>Response body text.</summary>
    public string Html { get; set; }

    /// <summary>Cookie text taken from the response.</summary>
    public string Cookie { get; set; }

    /// <summary>Error message for the call.</summary>
    public string ErrorMsg { get; set; }

    /// <summary>HTTP status code of the response.</summary>
    public HttpStatusCode StatusCode { get; set; }

    /// <summary>Status description of the response.</summary>
    public string StatusDescription { get; set; }

    /// <summary>Response headers.</summary>
    public WebHeaderCollection Headers { get; set; }

    /// <summary>Response URI.</summary>
    public Uri ResponseUrl { get; set; }
}
}


View Code
3 举个简单的例子

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: