您的位置:首页 > 编程语言 > ASP

asp.net 网页抓取内容

2016-04-08 17:59 591 查看
using System;

using System.Collections.Generic;

using System.Linq;

using System.Web;

//

using System.Net;

using System.IO;

using System.Text.RegularExpressions;

using System.Text;

namespace WSYL.Web.Common

{

/// <summary>
/// Downloads one fixed web page and extracts the text of specific table cells
/// (<c>&lt;td align="left" bgcolor="#EEEEEE"&gt;</c>) from its HTML.
/// </summary>
public static class GetSteamShipInfo
{
    // Address of the page to download. NOTE(review): the literal looks like a
    // placeholder and must be replaced with a real absolute URL before use.
    private const string UrlToCrawl = @"网址";

    // Charset used to decode the response body.
    private const string HtmlCharset = "utf-8";

    // Matches one target cell; group 1 captures the cell text (no nested tags allowed).
    // Cached once instead of being rebuilt on every call.
    private static readonly Regex CellPattern = new Regex(
        "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Matches any HTML tag, including tags that span line breaks.
    private static readonly Regex TagPattern = new Regex("<(.|\n)+?>");

    // Matches a run of whitespace so it can be collapsed to a single space.
    private static readonly Regex WhitespacePattern = new Regex(@"\s+");

    /// <summary>
    /// Fetches the page and returns text taken from the first and/or second matching table cell.
    /// </summary>
    /// <param name="steamshipname">
    /// Only validated: a null, empty or whitespace-only value makes the method return null
    /// without any network access. The value is not used to build the request.
    /// </param>
    /// <param name="itype">
    /// 0 = text of the first matching cell;
    /// 1 = tag-stripped text of the second matching cell;
    /// 2 = first cell text + "/" + tag-stripped second cell;
    /// any other value = the complete downloaded HTML.
    /// </param>
    /// <returns>
    /// null when <paramref name="steamshipname"/> is blank; an empty string when the first
    /// cell is missing or empty; otherwise the value selected by <paramref name="itype"/>.
    /// </returns>
    /// <remarks>
    /// Errors raised while creating the request or reading the response are not caught here
    /// and propagate to the caller.
    /// TODO (from the original author, 2015-08-12): add handling for page-load timeouts so a
    /// hanging page cannot stall the caller.
    /// </remarks>
    public static string GetWebSite(string steamshipname, int itype)
    {
        if (steamshipname == null || steamshipname.Trim().Length == 0)
        {
            return null;
        }

        // Step 1: download the page HTML.
        string respHtml = DownloadHtml(UrlToCrawl, Encoding.GetEncoding(HtmlCharset));

        // Step 2: locate the first target cell; an unsuccessful match yields an empty group.
        Match firstCell = CellPattern.Match(respHtml);
        string firstText = firstCell.Groups[1].Value;
        if (firstText.Length == 0)
        {
            return "";
        }

        // Step 3: pick the requested piece(s). NextMatch() is the second cell; when there is
        // no second cell its Value is empty, so the stripped result is an empty string.
        switch (itype)
        {
            case 0:
                return firstText;
            case 1:
                return StripHtml(firstCell.NextMatch().Value);
            case 2:
                return firstText + "/" + StripHtml(firstCell.NextMatch().Value);
            default:
                // Unknown selector: the original behaviour is to hand back the whole page.
                return respHtml;
        }
    }

    /// <summary>
    /// Issues an HTTP GET for <paramref name="url"/> and returns the body decoded with
    /// <paramref name="encoding"/>.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="encoding">Encoding used to decode the response stream.</param>
    /// <returns>The full response body as text.</returns>
    private static string DownloadHtml(string url, Encoding encoding)
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        req.Method = "GET";

        // Dispose response, stream and reader deterministically so the underlying
        // connection is released even when reading fails.
        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        using (Stream body = resp.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Removes HTML tags from the input, collapses every run of whitespace into a single
    /// space and trims the result.
    /// </summary>
    /// <remarks>
    /// The original author notes that some inputs are not fully cleaned in one pass and
    /// suggests applying the conversion twice in such cases.
    /// Two replacements of "&lt;" and "&gt;" with themselves were removed because they had
    /// no effect; the output is unchanged.
    /// </remarks>
    /// <param name="strHtml">HTML fragment to clean.</param>
    /// <returns>The plain text with normalised whitespace.</returns>
    private static string StripHtml(string strHtml)
    {
        string withoutTags = TagPattern.Replace(strHtml, "");
        return WhitespacePattern.Replace(withoutTags, " ").Trim();
    }
}

}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息