您的位置:首页 > 其它

网页抓取数据并分析,特别包括分页数据的抓取。

2013-03-05 10:46 363 查看
包括8个按钮,每个按钮下的代码都可运行(第5、6个可能需要调试一下)。
有基本的页面抓取,不含分页数据的;
有含分页数据,且【下一页】的链接是网址的;
有含分页数据,且【下一页】的链接是__doPostBack;
有含分页数据,且【下一页】是一个.gif图片链接的(可通过F12在页面源码中找到对应的href)。
参考网址:http://www.cnblogs.com/ceachy/articles/CSharp_Retrive_Page_Document.html
http://www.cnblogs.com/ghfsusan/archive/2010/05/26/1744820.html
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;

namespace WindowsFormsApplication1
{
    /// <summary>
    /// Demo form whose eight buttons each show one way of downloading a web page
    /// and saving or parsing its content: plain GET requests, an embedded
    /// <see cref="WebBrowser"/>, and ASP.NET __doPostBack-style paging via POST.
    /// </summary>
    /// <remarks>
    /// NOTE(review): <c>InitializeComponent</c> and <c>richTextBox1</c> are not defined
    /// in this file; they are presumably declared in the designer half of this
    /// partial class.
    /// </remarks>
    public partial class Form1 : Form
    {
        // Directory every handler writes its captured output into.
        private const string OutputDirectory = "C:\\Users\\yuan\\Desktop\\";

        // Forum thread list used by buttons 2, 4 and 5.
        private const string ForumListUrl = "http://bbs.cup.edu.cn/cupbbs/ThreadList.aspx?fid=51";

        // Road-condition query page used by buttons 3 and 6.
        private const string RoadInfoPageUrl = "http://www.chinahighway.gov.cn/html/staticHtml/front/index_lkcx.html";

        // Control id posted as __EVENTTARGET to request the next page.
        // Changing the trailing _ctlN selects a different pager link.
        private const string NextPageEventTarget = "grdThreadList:_ctl27:_ctl3";

        // Matches the hidden <input> element that carries __VIEWSTATE.
        private static readonly Regex ViewStateInputPattern = new Regex(
            @"<input\b[^>]*\bname\s*=\s*[""']__VIEWSTATE[""'][^>]*>",
            RegexOptions.IgnoreCase);

        // Matches a value="..." attribute inside a single tag.
        private static readonly Regex ValueAttributePattern = new Regex(
            @"\bvalue\s*=\s*[""']([^""']*)[""']",
            RegexOptions.IgnoreCase);

        // __VIEWSTATE of the page fetched last by button6; posted back on the
        // next click so the server returns the following page.
        private static string current__viewstate = "";

        // Browser used by button3. Kept in a field so it stays referenced while
        // the asynchronous navigation runs and can be disposed on the next click.
        private WebBrowser _scrapeBrowser;

        /// <summary>Creates the form and builds its designer-generated controls.</summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the HTML source of a page, decoding the body as GB2312.
        /// </summary>
        /// <param name="Url">Address of the page to download.</param>
        /// <returns>
        /// The page source, or an empty string when the request fails (the error
        /// is shown to the user in a message box instead of being thrown).
        /// </returns>
        private string GetWebContent(string Url)
        {
            string strResult = "";
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                request.Timeout = 30000; // connection timeout in milliseconds
                request.Headers.Set("Pragma", "no-cache");

                // Dispose response, stream and reader even when reading fails.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream streamReceive = response.GetResponseStream())
                using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
                {
                    strResult = streamReader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Deliberately best-effort: callers rely on "" meaning "nothing downloaded".
                MessageBox.Show(ex.Message, "出错");
            }
            return strResult;
        }

        /// <summary>
        /// Returns the first &lt;table&gt;...&lt;/table&gt; fragment that follows
        /// <paramref name="marker"/> inside the document body.
        /// </summary>
        /// <param name="html">Full page source; may be null or empty.</param>
        /// <param name="marker">Text that precedes the wanted table.</param>
        /// <returns>The table markup, or null when any of the anchors is missing.</returns>
        private static string ExtractTableAfter(string html, string marker)
        {
            const string tableClose = "</table>";

            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            int bodyStart = html.IndexOf("<body", StringComparison.Ordinal);
            if (bodyStart < 0)
            {
                return null;
            }

            int markerStart = html.IndexOf(marker, bodyStart, StringComparison.Ordinal);
            if (markerStart < 0)
            {
                return null;
            }

            int tableStart = html.IndexOf("<table", markerStart, StringComparison.Ordinal);
            if (tableStart < 0)
            {
                return null;
            }

            int tableEnd = html.IndexOf(tableClose, tableStart, StringComparison.Ordinal);
            if (tableEnd < 0)
            {
                return null;
            }

            return html.Substring(tableStart, tableEnd - tableStart + tableClose.Length);
        }

        /// <summary>
        /// Button 1: downloads the song chart page, shows its source in
        /// <c>richTextBox1</c>, and walks the rows of the table that follows the
        /// "歌曲TOP500" heading.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Page to scrape.
            string Url = "http://list.mp3.baidu.com/topso/mp3topsong.html?id=1#top2";

            string strWebContent = GetWebContent(Url);
            richTextBox1.Text = strWebContent;

            // Cut out only the markup that holds the data.
            string strWeb = ExtractTableAfter(strWebContent, "歌曲TOP500");
            if (strWeb == null)
            {
                return; // download failed or the page layout changed
            }

            // Load the fragment into an off-screen browser to get a parsed HtmlDocument.
            using (WebBrowser webb = new WebBrowser())
            {
                webb.Navigate("about:blank");
                if (webb.Document == null)
                {
                    return; // the blank document is not available yet
                }

                HtmlDocument htmldoc = webb.Document.OpenNew(true);
                htmldoc.Write(strWeb);

                foreach (HtmlElement tr in htmldoc.GetElementsByTagName("TR"))
                {
                    HtmlElementCollection cells = tr.GetElementsByTagName("TD");
                    if (cells.Count < 2)
                    {
                        continue; // header or spacer row without the two data cells
                    }

                    string strID = (cells[0].InnerText ?? "").Replace(".", "");
                    // Song name and singer are both read from the second cell.
                    string strName = cells[1].InnerText;
                    string strSinger = cells[1].InnerText;

                    // TODO: hand the row to the storage layer once it exists, e.g.
                    // AddLine(strID, strName, strSinger, "0");
                }
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/>, decodes it with
        /// <paramref name="pageEncoding"/>, echoes the text to the console and
        /// writes it to <paramref name="outputPath"/> (overwriting the file).
        /// Network and file errors are reported on the console, not thrown.
        /// </summary>
        private static void DownloadPageToFile(string url, Encoding pageEncoding, string outputPath)
        {
            try
            {
                using (WebClient client = new WebClient())
                {
                    // Authenticate with the credentials of the current user.
                    client.Credentials = CredentialCache.DefaultCredentials;

                    byte[] pageData = client.DownloadData(url);
                    string pageHtml = pageEncoding.GetString(pageData);
                    Console.WriteLine(pageHtml);

                    using (StreamWriter sw = new StreamWriter(outputPath))
                    {
                        sw.Write(pageHtml);
                    }
                }
                // No Console.ReadLine() here: this runs on the UI thread and
                // waiting for console input would freeze the form.
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message);
            }
            catch (IOException ioEx)
            {
                Console.WriteLine(ioEx.Message);
            }
            catch (UnauthorizedAccessException accessEx)
            {
                Console.WriteLine(accessEx.Message);
            }
        }

        /// <summary>Button 2: saves the forum thread list, decoded as UTF-8.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            DownloadPageToFile(ForumListUrl, Encoding.UTF8, OutputDirectory + "ouput.html");
        }

        /// <summary>
        /// Runs when the button3 browser reports a finished document and appends
        /// the inner text of every table on the page to the output file.
        /// </summary>
        void web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            WebBrowser web = (WebBrowser)sender;

            // The event can be raised once per frame; only act when the whole
            // page has finished loading.
            if (web.ReadyState != WebBrowserReadyState.Complete || web.Document == null)
            {
                return;
            }

            // One capture per navigation.
            web.DocumentCompleted -= web_DocumentCompleted;

            HtmlElementCollection tables = web.Document.GetElementsByTagName("Table");
            if (tables.Count == 0)
            {
                return;
            }

            // Collect everything first so the file is opened only once.
            StringBuilder tableText = new StringBuilder();
            foreach (HtmlElement item in tables)
            {
                tableText.Append(item.InnerText);
            }
            File.AppendAllText(OutputDirectory + "ouputbutton3.txt", tableText.ToString());
        }

        /// <summary>
        /// Button 3: loads the road-condition page in an off-screen browser; the
        /// tables are captured in <see cref="web_DocumentCompleted"/>.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            // Release the browser left over from a previous click.
            if (_scrapeBrowser != null)
            {
                _scrapeBrowser.DocumentCompleted -= web_DocumentCompleted;
                _scrapeBrowser.Dispose();
            }

            _scrapeBrowser = new WebBrowser();
            // Subscribe before navigating so the completion event cannot be missed.
            _scrapeBrowser.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(web_DocumentCompleted);
            _scrapeBrowser.Navigate(RoadInfoPageUrl);
        }

        /// <summary>
        /// Button 4: fetches the forum thread list with an explicit browser
        /// User-Agent, decodes it as gb2312 and appends it to the output file.
        /// Network and file errors are reported on the console.
        /// </summary>
        private void button4_Click(object sender, EventArgs e)
        {
            try
            {
                HttpWebRequest _WebRequest = (HttpWebRequest)WebRequest.Create(ForumListUrl);
                _WebRequest.UserAgent = "MOZILLA/4.0(COMPATIBLE;MSIE7.0;WINDOWSNT5.2;.NETCLR1.1.4322;.NETCLR2.0.50727;.NETCLR3.0.04506.648;.NETCLR3.5.21022;.NETCLR3.0.4506.2152;.NETCLR3.5.30729)";
                _WebRequest.Method = WebRequestMethods.Http.Get;

                string _StrResponse;
                using (WebResponse _WebResponse = _WebRequest.GetResponse())
                using (StreamReader _ResponseStream = new StreamReader(_WebResponse.GetResponseStream(), Encoding.GetEncoding("gb2312")))
                {
                    _StrResponse = _ResponseStream.ReadToEnd();
                }

                File.AppendAllText(OutputDirectory + "ouputbutton4.txt", _StrResponse);
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message);
            }
            catch (IOException ioEx)
            {
                Console.WriteLine(ioEx.Message);
            }
        }

        /// <summary>
        /// Simulates an ASP.NET __doPostBack "next page" click by POSTing
        /// __VIEWSTATE and __EVENTTARGET to <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Page that receives the post-back.</param>
        /// <param name="viewState">The __VIEWSTATE value of the page currently shown.</param>
        /// <returns>The response body decoded as UTF-8.</returns>
        private static string PostNextPageRequest(string url, string viewState)
        {
            System.Collections.Specialized.NameValueCollection PostVars = new System.Collections.Specialized.NameValueCollection();
            PostVars.Add("__VIEWSTATE", viewState);
            // Pages that emit __EVENTVALIDATION / __EVENTARGUMENT need those posted as well.
            PostVars.Add("__EVENTTARGET", NextPageEventTarget);

            using (WebClient WebClientObj = new WebClient())
            {
                // The real header name is "Content-Type"; use the enum to avoid typos.
                WebClientObj.Headers[HttpRequestHeader.ContentType] = "application/x-www-form-urlencoded";
                byte[] byte1 = WebClientObj.UploadValues(url, "POST", PostVars);
                return Encoding.UTF8.GetString(byte1);
            }
        }

        /// <summary>
        /// Button 5: requests the next page of the forum list using a hard-coded
        /// __VIEWSTATE and appends the response to the output file.
        /// </summary>
        private void button5_Click(object sender, EventArgs e)
        {
            try
            {
                // The view state below is a truncated placeholder from the original
                // article; paste the full value of the live page before running.
                string ResponseStr = PostNextPageRequest(
                    ForumListUrl,
                    "dDwxNDMyMjQ3NTY4O3Q8O2w8aTwxPjs+O2w8dDw7bDxpPDM+O2k8Nz47PjtsPHQ8O2w8aTwwPjtpPDE+O2k8Mj47aTwzPjtpP........省略了,太长了.......");
                File.AppendAllText(OutputDirectory + "ouputbutton5.txt", ResponseStr);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

        /// <summary>
        /// Extracts the value of the hidden __VIEWSTATE input from a page.
        /// </summary>
        /// <param name="ResponseStr">HTML of the page just downloaded; may be null.</param>
        /// <returns>The __VIEWSTATE value, or an empty string when the page has none.</returns>
        private string GetPostValue(string ResponseStr)
        {
            if (string.IsNullOrEmpty(ResponseStr))
            {
                return "";
            }

            Match inputTag = ViewStateInputPattern.Match(ResponseStr);
            if (!inputTag.Success)
            {
                return "";
            }

            // Look for value="..." inside the matched tag only, so the attribute
            // order within the tag does not matter.
            Match valueAttribute = ValueAttributePattern.Match(inputTag.Value);
            return valueAttribute.Success ? valueAttribute.Groups[1].Value : "";
        }

        /// <summary>
        /// Button 6: like button 5, but posts the __VIEWSTATE remembered from the
        /// previous click and stores the one from the new response, so repeated
        /// clicks walk through the pages.
        /// </summary>
        private void button6_Click(object sender, EventArgs e)
        {
            try
            {
                string ResponseStr = PostNextPageRequest(RoadInfoPageUrl, current__viewstate);
                // Remember the state of the page just received for the next click.
                current__viewstate = GetPostValue(ResponseStr);
                File.AppendAllText(OutputDirectory + "ouputbutton6.txt", ResponseStr);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

        /// <summary>
        /// Button 7: saves page 2 of a multi-page article whose "next page" link
        /// is a plain URL. The page is decoded as GB2312.
        /// </summary>
        private void button7_Click(object sender, EventArgs e)
        {
            DownloadPageToFile(
                "http://gb.cri.cn/42071/2013/03/05/3245s4038640_2.htm",
                Encoding.GetEncoding("GB2312"),
                OutputDirectory + "ouput7.html");
        }

        /// <summary>
        /// Button 8: saves one page of a paged query. Use this approach when the
        /// "next page" control is an image (.gif) and the status bar shows only a
        /// truncated address: open View Source or the F12 tools, find the link's
        /// href, and request that URL directly. The page is decoded as GB2312.
        /// </summary>
        private void button8_Click(object sender, EventArgs e)
        {
            DownloadPageToFile(
                "http://www.chinahighway.gov.cn/roadInfo/queryRoadInfo.do?queryType=map&startDate=&cantonName=&cantonCode=&infoType=3&endDate=&startPlanDate=&_page_size=50&roadName=&mapList=-1&endRealDate=&roadCode=&provinceList=-1&endPlanDate=&startRealDate=&page=3",
                Encoding.GetEncoding("GB2312"),
                OutputDirectory + "ouput8.txt");
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: