自己写的抓取Google赞助商链接的页面源码
2007-02-02 18:56
621 查看
/// <summary>
/// Downloads the search result page for the keyword typed into <c>TextBoxAdWord</c>,
/// extracts the sponsored-link table from the HTML and binds the resulting list
/// (position, title, link) to <c>GridViewAdList</c>.
/// </summary>
/// <remarks>
/// The request URL is <c>TextBoxGatherURL.Text</c> followed by the URL-encoded keyword.
/// The response is decoded as GB2312. The extraction relies on the exact table markup
/// hard-coded in the patterns below, so it yields an empty list when the page layout differs.
/// Network and URL-format exceptions are not caught here and propagate to the caller.
/// </remarks>
private void GetInfo()
{
    string StrGatherURL = TextBoxGatherURL.Text + HttpUtility.UrlEncode(TextBoxAdWord.Text);

    // Scheme + host part of the request URL; the scraped "/url..." links are relative to it.
    // Uri handles URLs without a path, where manual Substring/IndexOf arithmetic breaks.
    string StrGatherBaseURL = new Uri(StrGatherURL).GetLeftPart(UriPartial.Authority);

    // Dispose the client, the response stream and the reader deterministically so the
    // underlying connection is released even when reading throws.
    string StrFullInfo;
    using (WebClient InfoWclient = new WebClient())
    using (Stream InfoStream = InfoWclient.OpenRead(StrGatherURL))
    using (StreamReader InfoStreamReader = new StreamReader(InfoStream, Encoding.GetEncoding("GB2312")))
    {
        StrFullInfo = InfoStreamReader.ReadToEnd();
    }

    // The ad table ends differently depending on whether the page shows the
    // "more sponsored links" footer, so pick the matching closing markup.
    // These regexes are built per call and used once, so RegexOptions.Compiled is not used.
    Regex AdListInfoRegex;
    if (StrFullInfo.IndexOf("更多赞助商链接", StringComparison.Ordinal) >= 0)
    {
        AdListInfoRegex = new Regex("<table cellspacing=0 cellpadding=0 width=25% align=right bgcolor=#ffffff border=0>.*更多赞助商链接 »</a></font></td></tr></table>", RegexOptions.IgnoreCase);
    }
    else
    {
        AdListInfoRegex = new Regex("<table cellspacing=0 cellpadding=0 width=25% align=right bgcolor=#ffffff border=0>.*<font size=-1></font></td></tr></table>", RegexOptions.IgnoreCase);
    }
    string StrAdList = AdListInfoRegex.Match(StrFullInfo).Value;

    Regex AdLinkTitleRegex = new Regex("<font size=.?0.*</font></a>", RegexOptions.IgnoreCase);
    Regex AdLinkSrcRegex = new Regex("/url.*target=nw", RegexOptions.IgnoreCase);

    // '.' does not match '\n', so inserting a line break after every link / title keeps
    // each greedy ".*" match confined to a single ad entry.
    MatchCollection AdLinkSrcMatchCollection = AdLinkSrcRegex.Matches(StrAdList.Replace("target=nw>", "target=nw\n"));
    // Split on <br> case-insensitively instead of lower-casing the whole HTML,
    // so the ad titles keep their original letter case.
    MatchCollection AdLinkTitleMatchCollection = AdLinkTitleRegex.Matches(Regex.Replace(StrAdList, "<br>", "\n", RegexOptions.IgnoreCase));

    DataTable AdListDt = new DataTable();
    AdListDt.Columns.Add("Place");
    AdListDt.Columns.Add("Title");
    AdListDt.Columns.Add("Value");

    for (int i = 0; i < AdLinkSrcMatchCollection.Count; i++)
    {
        string StrAdLinkSrc = AdLinkSrcMatchCollection[i].Value;
        string StrAdListItemValue = StrGatherBaseURL + StrAdLinkSrc.Replace(" target=nw", "");

        // The title and link patterns are matched independently, so there may be fewer
        // titles than links; fall back to an empty title instead of throwing.
        string StrAdTitle = string.Empty;
        if (i < AdLinkTitleMatchCollection.Count)
        {
            StrAdTitle = AdLinkTitleMatchCollection[i].Value;
            StrAdTitle = Regex.Replace(StrAdTitle, @"<font size=\+0>", "", RegexOptions.IgnoreCase);
            StrAdTitle = Regex.Replace(StrAdTitle, "</font></a>", "", RegexOptions.IgnoreCase);
        }

        DataRow AdAddRow = AdListDt.NewRow();
        AdAddRow["Place"] = (i + 1).ToString();
        AdAddRow["Title"] = StrAdTitle;
        AdAddRow["Value"] = StrAdListItemValue;
        AdListDt.Rows.Add(AdAddRow);
    }

    GridViewAdList.DataSource = AdListDt;
    GridViewAdList.Columns[0].HeaderText = "位置";
    GridViewAdList.Columns[0].Width = 40;
    GridViewAdList.Columns[1].HeaderText = "标题";
    GridViewAdList.Columns[1].Width = 200;
    GridViewAdList.Columns[2].HeaderText = "链接";
    // The raw link column is kept in the data but hidden from the grid.
    GridViewAdList.Columns[2].Visible = false;
}
// NOTE(review): this brace block repeats the body of GetInfo() above but has no method
// signature of its own; it looks like a copy/paste or page-extraction duplicate — confirm
// before compiling. The code below is left unchanged.
{
WebClient InfoWclient = new WebClient();
// Request URL = base search URL from the text box + URL-encoded keyword.
string StrGatherURL = TextBoxGatherURL.Text + HttpUtility.UrlEncode(TextBoxAdWord.Text);
// Takes the leading part of the URL up to the first "/" found after removing "//".
string StrGatherBaseURL = StrGatherURL.Substring(0, StrGatherURL.Replace("//", "").IndexOf("/") + 2);
Stream InfoStream = InfoWclient.OpenRead(StrGatherURL);
// The response is decoded as GB2312.
StreamReader InfoStreamReader = new StreamReader(InfoStream, Encoding.GetEncoding("GB2312"));
string StrFullInfo = InfoStreamReader.ReadToEnd();
Regex AdListTypeRegex = new Regex("更多赞助商链接");
Regex AdListInfoRegex;
// Choose the table-closing markup depending on whether the footer text is present in the page.
if (AdListTypeRegex.IsMatch(StrFullInfo))
{
AdListInfoRegex = new Regex("<table cellspacing=0 cellpadding=0 width=25% align=right bgcolor=#ffffff border=0>.*更多赞助商链接 »</a></font></td></tr></table>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
}
else
{
AdListInfoRegex = new Regex("<table cellspacing=0 cellpadding=0 width=25% align=right bgcolor=#ffffff border=0>.*<font size=-1></font></td></tr></table>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
}
string StrAdList = AdListInfoRegex.Match(StrFullInfo).ToString();
Regex AdLinkTitleRegex = new Regex("<font size=.?0.*</font></a>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
Regex AdLinkSrcRegex = new Regex("/url.*target=nw", RegexOptions.IgnoreCase | RegexOptions.Compiled);
// A "\n" is inserted after each link / at each <br> before matching; "." does not match "\n",
// so each ".*" match stays within one line.
MatchCollection AdLinkSrcMatchCollection = AdLinkSrcRegex.Matches(StrAdList.Replace("target=nw>", "target=nw\n"));
MatchCollection AdLinkTitleMatchCollection = AdLinkTitleRegex.Matches(StrAdList.ToLower().Replace("<br>", "\n"));
DataTable AdListDt = new DataTable();
AdListDt.Columns.Add("Place");
AdListDt.Columns.Add("Title");
AdListDt.Columns.Add("Value");
// One row per matched link; the title is taken from the title match at the same index.
for (int i = 0; i < AdLinkSrcMatchCollection.Count; i++)
{
string StrAdLinkSrc = AdLinkSrcMatchCollection[i].ToString();
string StrAdListItemValue = StrGatherBaseURL + StrAdLinkSrc.Replace(" target=nw", "");
DataRow AdAddRow = AdListDt.NewRow();
AdAddRow["Place"] = (i+1).ToString();
AdAddRow["Title"] = AdLinkTitleMatchCollection[i].ToString().Replace("<font size=+0>", "").Replace("</font></a>", "");
AdAddRow["Value"] = StrAdListItemValue;
AdListDt.Rows.Add(AdAddRow);
}
// Bind the table to the grid, set the column headers/widths and hide the link column.
GridViewAdList.DataSource = AdListDt;
GridViewAdList.Columns[0].HeaderText = "位置";
GridViewAdList.Columns[0].Width = 40;
GridViewAdList.Columns[1].HeaderText = "标题";
GridViewAdList.Columns[1].Width = 200;
GridViewAdList.Columns[2].HeaderText = "链接";
GridViewAdList.Columns[2].Visible = false;
}
相关文章推荐
- phantomjs.exe 无界面版chrome抓取网页源码提取页面链接
- C# 抓取页面中的所有链接
- 在自己的页面上添加GOOGLE的PageRank
- Google 更新自己的APP 链接
- Python抓取页面中超链接(URL)的三种方法比较(HTMLParser、pyquery、正则表达式) <转>
- 自己写的一个抓取页面email地址的小程序
- 自己写的一个正则表达式抓取页面内容
- 网络爬虫初步:从一个入口链接开始不断抓取页面中的网址并入库
- SEO 2009 Google优化排名之页面优化的链接因素
- php防盗链,php ci在control里面控制除了自己站内的链接点击跳转,其他来源的都跳到站内页面
- 页面抓取特定链接
- 自己的一些 Demo,源码链接
- 下载Google官方/CM Android源码自动重新开始的Shell脚本
- 网络爬虫初步:从一个入口链接开始不断抓取页面中的网址并入库
- Python 抓取google链接代码
- python - 抓取页面上的链接
- 1. F12可以查看最终的静态html页面和JavaScript源代码,那自己写js源码不就泄露了吗 2. 由JavaScript代码暴露,重新认识“前端”和“后端”
- 将google搜索的内容显示在自己的页面上
- python抓取google链接原理详解
- Python实现抓取页面上链接的简单爬虫分享