您的位置:首页 > Web前端 > HTML

获取搜狗搜索引擎添加任意关键词后的 HTML 源码(C#)

2016-03-22 21:45 567 查看
一直在不断测试中,希望有大神指点如何获取搜狗微信文章。

class Program
{
// Business-layer objects: mkBll supplies the keyword list read in Main,
// wqBll stores each scraped article (both types are declared elsewhere).
private static ManageKeywordBll mkBll = new ManageKeywordBll();
private static WeixinquanMessageBll wqBll = new WeixinquanMessageBll();
// HTTP/cookie helper used for every request made by this class.
private static CookieHelp ch = new CookieHelp();
// Written by RandomUserAgent; no read of it is visible in this class.
private static string userAgentLiShi = "";
private static string result = "";// HTML returned by the most recent search-page request
private static string[] kw;// search terms of the keyword currently being processed
private static string id = "";// id returned by the captcha endpoint, sent as the SNUID cookie value

/// <summary>
/// Entry point. Loads the keywords of module "6,", runs a Sogou WeChat search
/// for each of them, and stores every article returned by GetTitleContent.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    ManageKeyword model = new ManageKeyword();
    model.Module = "6,";
    List<ManageKeyword> list = mkBll.GetList(model);

    if (list != null)
    {
        foreach (ManageKeyword data in list)
        {
            // A row without a keyword cannot be searched; skip it instead of
            // failing on a null reference.
            if (String.IsNullOrEmpty(data.KeyWord))
            {
                continue;
            }

            // "*" in a stored keyword is replaced by a space before searching.
            string keyword = data.KeyWord.Replace("*", " ");

            // The keyword is URL-encoded so that spaces, "&", "#" and non-ASCII
            // characters cannot break or truncate the query string.
            // NOTE(review): "page=(*)" is passed on literally here; presumably
            // CookieHelp substitutes the page number - confirm.
            string url = "http://weixin.sogou.com/weixin?"
                + "query=" + HttpUtility.UrlEncode(keyword) + "&"
                + "_sug_type_=1&"
                + "sut=0&"
                + "sourceid=inttime_all&"// inttime_day = today only, inttime_all = all time
                + "ri=0&"
                + "_sug_=n&"
                + "type=2&"
                + "ie=utf-8&"
                + "interation=&"
                + "tsn=0&"// 0: all time, 1: within one day, 2: within one week
                + "page=(*)";

            // The cookie string used for the search request is captured and
            // reused for the article requests. Original author's note: while the
            // ~10 results of a page are processed the cookie container may end up
            // holding a different SUID, which invalidates the article links.
            string cookieAll = GetHtmlByYzm(url, keyword);
            List<WeixinquanMessage> listWeixin = GetTitleContent(result, keyword, cookieAll);
            foreach (WeixinquanMessage wm in listWeixin)
            {
                // Store the keyword as configured (with "*"), not the search form.
                wm.KeyWord = data.KeyWord;
                wqBll.Add(wm);
            }
        }
    }

    Console.WriteLine("OK");
    Console.ReadKey();
}

/// <summary>
/// Converts a date/time to the time elapsed since the Unix epoch
/// (1970-01-01 00:00:00 UTC).
/// </summary>
/// <param name="flag">
/// true: result in milliseconds (13 digits for present-day dates);
/// false: result in 100-nanosecond ticks (17 digits for present-day dates).
/// </param>
/// <param name="time">
/// The moment to convert. Local and UTC values are both handled; a value of
/// unspecified kind is treated as local time.
/// </param>
/// <returns>Elapsed milliseconds or ticks since the epoch.</returns>
private static long ConvertDateTimeToInt(bool flag, System.DateTime time)
{
    // Work in UTC: subtracting a "local epoch" is off by the difference between
    // the UTC offset in 1970 and the offset at <paramref name="time"/> (e.g. DST).
    System.DateTime epoch = new System.DateTime(1970, 1, 1, 0, 0, 0, System.DateTimeKind.Utc);
    long elapsedTicks = time.ToUniversalTime().Ticks - epoch.Ticks;

    return flag ? elapsedTicks / System.TimeSpan.TicksPerMillisecond : elapsedTicks;
}

/// <summary>
/// Converts a Unix timestamp in seconds, given as text, to a local date/time.
/// </summary>
/// <param name="timeStamp">
/// Decimal seconds since 1970-01-01 00:00:00 UTC. Null, empty or blank text is
/// treated as 0.
/// </param>
/// <returns>The corresponding moment expressed in local time.</returns>
/// <exception cref="FormatException">The text is not a whole number.</exception>
private static DateTime ConvertStringToDateTime(string timeStamp)
{
    long seconds = String.IsNullOrWhiteSpace(timeStamp) ? 0 : long.Parse(timeStamp);

    // Add to the UTC epoch first and convert afterwards, so the UTC offset that
    // applies at the resulting moment (including DST) is used rather than the
    // offset that applied in 1970.
    DateTime epoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
    return epoch.AddSeconds(seconds).ToLocalTime();
}

// One generator for the whole process: a Random created per call can be seeded
// with the same clock value on quick successive calls and then repeats itself.
private static readonly Random userAgentRandom = new Random();

/// <summary>
/// Picks a random entry of UserAgentList.userAgentShuZu and stores it in
/// userAgentLiShi. Nothing is returned; when the list is null or empty the
/// previously stored value is left unchanged.
/// </summary>
private static void RandomUserAgent()
{
    var agents = UserAgentList.userAgentShuZu;
    if (agents == null || agents.Length == 0)
    {
        return;
    }

    userAgentLiShi = agents[userAgentRandom.Next(agents.Length)];
}

/// <summary>
/// Passes the Sogou anti-spider captcha and returns the id contained in the
/// "unblocked" response. Up to 10 captcha images are requested and submitted.
/// Each attempt resets the cookie container and cookie string of the shared
/// CookieHelp instance.
/// </summary>
/// <param name="keywords">Search keyword, embedded in the post data and referer.</param>
/// <returns>
/// The id value (used by the caller as the SNUID cookie), or an empty string
/// when no attempt succeeded or the response carries no id.
/// </returns>
private static string GetSNUID(string keywords)
{
    const string thankUrl = "http://weixin.sogou.com/antispider/thank.php";
    const string successMarker = "解封成功,正在为您跳转来源地址...";

    // Identical for every attempt, so built once.
    string referer = HttpUtility.UrlEncode("http://weixin.sogou.com/antispider/?from=/weixin?type=2&query=" + keywords + "&ie=utf8&_sug_=n&_sug_type_=");
    string response = "";
    bool unblocked = false;

    for (int attempt = 0; attempt < 10 && !unblocked; attempt++)
    {
        // Start every attempt without any previously collected cookies.
        ch.MyCookieContainer = new CookieContainer();
        ch.Cookie = "";

        // The captcha image URL takes the current Unix time in seconds.
        long tc = ConvertDateTimeToInt(true, DateTime.Now) / 1000;
        string imageUrl = "http://weixin.sogou.com/antispider/util/seccode.php?tc=" + tc.ToString() + "";

        CodeModel cm;
        // NOTE(review): assumes Image is System.Drawing.Image and that GetCode
        // does not keep the image after returning - confirm. Disposing releases
        // the bitmap handle of each attempt.
        using (Image image = ch.getImage(imageUrl, "UTF-8"))
        {
            CodeFactory cf = new CodeFactory(CodeType.YDM);
            cm = cf.GetCode(image);
        }

        if (cm == null)
        {
            continue;
        }

        string postdata = "c=" + cm.Code + "&r=/weixin?type=2&query=" + keywords + "&ie=utf8&_sug_=n&_sug_type_=&v=5";
        response = ch.PostAndGetResult(thankUrl, "UTF-8", postdata, "", "", referer) ?? "";
        unblocked = response.IndexOf(successMarker, StringComparison.Ordinal) > -1;
    }

    // Without a successful attempt there is no id to extract; previously the
    // Substring call below threw on a response that contained no "id".
    if (!unblocked)
    {
        return "";
    }

    int idPos = response.IndexOf("id", StringComparison.Ordinal);
    if (idPos < 0)
    {
        return "";
    }

    // Keep the text from "id" onwards and strip quotes, colons and spaces, then
    // drop the leading "id" and the final character.
    // NOTE(review): presumably the response is JSON ending in ..."id": "VALUE"} - confirm.
    string tail = response.Substring(idPos).Replace("\"", "").Replace(":", "").Replace(" ", "");
    return tail.Length >= 3 ? tail.Substring(2, tail.Length - 3) : "";
}

/// <summary>
/// Fetches <paramref name="urls"/> and stores the HTML in the static field
/// <c>result</c>. When the response is the captcha page ("请输入验证码"), the
/// request is repeated up to 10 times with a cookie string carrying the current
/// captcha id as SNUID; a new id is obtained from GetSNUID after every retry
/// that still shows the captcha page.
/// </summary>
/// <param name="urls">URL to fetch.</param>
/// <param name="keywords">Search keyword, passed on to GetSNUID.</param>
/// <returns>The cookie string used for the last request.</returns>
private static string GetHtmlByYzm(string urls,string keywords)
{
    const string captchaMarker = "请输入验证码";
    const string retryUserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0";

    // The first User-Agent literal used to start with a tab and had spaces
    // around "/" and ":", which is not a well-formed product token.
    result = ch.getHtml(urls, "UTF-8", "", "", "", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0") ?? "";
    string cookies = ch.Cookie ?? "";

    if (result.IndexOf(captchaMarker, StringComparison.Ordinal) < 0)
    {
        return cookies;
    }

    // Cut the received cookies just before "SNUID" (dropping it and everything
    // after it); our own SNUID is appended below.
    string baseCookies = cookies;
    int snuidPos = baseCookies.IndexOf("SNUID", StringComparison.Ordinal);
    if (snuidPos > -1)
    {
        baseCookies = baseCookies.Substring(0, Math.Max(0, snuidPos - 1));
    }

    for (int attempt = 0; attempt < 10; attempt++)
    {
        // Rebuilt from baseCookies every time so the SUV/seccodeRight/refresh
        // entries are not appended again on each retry.
        // Original author's note: after a captcha, seccodeRight=success and
        // refresh=1 are required; successCount can be left out.
        cookies = baseCookies + ";SUV=00677DCD6F11A86256FCD05375656513;seccodeRight=success;refresh=1;SNUID=" + id;
        result = ch.getHtml(urls, "UTF-8", cookies, "", "", retryUserAgent) ?? "";

        if (result.IndexOf(captchaMarker, StringComparison.Ordinal) < 0)
        {
            break;
        }

        // Original author's note: one id is accepted for about 4-5 requests, so
        // a new one is requested only when the captcha page shows up again.
        id = GetSNUID(keywords);
    }

    return cookies;
}

/// <summary>
/// Replaces the HTML entities of common (mostly Chinese) punctuation with the
/// characters they stand for.
/// </summary>
/// <param name="parm">Text that may contain HTML entities.</param>
/// <returns>
/// The text with the listed entities decoded; null or empty input is returned
/// unchanged.
/// </returns>
private static string ReplaceZhongWenBiaoDian(string parm)
{
    if (String.IsNullOrEmpty(parm))
    {
        return parm;
    }

    // NOTE(review): the entity names are reconstructed - in the published
    // listing they had been decoded into the characters themselves, which left
    // both tables identical and produced an invalid """ literal. Confirm the list.
    // "&amp;" is decoded last so that "&amp;lt;" becomes "&lt;" and not "<".
    string[] entities = { "&nbsp;", "&quot;", "&#39;", "&apos;", "&ldquo;", "&rdquo;", "&mdash;", "&lt;", "&gt;", "&middot;", "&hellip;", "&amp;" };
    string[] characters = { " ", "\"", "'", "'", "\u201C", "\u201D", "\u2014", "<", ">", "\u00B7", "\u2026", "&" };

    for (int i = 0; i < entities.Length; i++)
    {
        parm = parm.Replace(entities[i], characters[i]);
    }

    return parm;
}

/// <summary>
/// Simplifies article HTML: removes all attributes (class, style, ...) from
/// opening tags, turns h1-h6 headings into paragraphs and removes img and
/// iframe tags.
/// </summary>
/// <param name="htmltag">HTML fragment to clean.</param>
/// <returns>The cleaned fragment; an empty string for null or empty input.</returns>
private static string RegexReplaceHtmlTag(string htmltag)
{
    if (String.IsNullOrEmpty(htmltag))
    {
        return "";
    }

    // "<name attr=...>" (also "<name />") becomes "<name>".
    string cleaned = Regex.Replace(htmltag, @"<([a-z0-9]+)\s[^>]*?>", "<${1}>", RegexOptions.IgnoreCase);

    // Headings become paragraphs. This runs on every heading tag, so headings
    // that never had attributes or are written in upper case are converted too.
    cleaned = Regex.Replace(cleaned, @"<(/?)h[1-6]>", "<${1}p>", RegexOptions.IgnoreCase);

    // Images and iframes cannot be kept once their attributes are gone.
    return Regex.Replace(cleaned, @"<img>|</?iframe>", "", RegexOptions.IgnoreCase);
}

/// <summary>
/// Runs a case-insensitive regular expression against a text and returns part
/// of the first match.
/// </summary>
/// <param name="source">Text to search (typically HTML).</param>
/// <param name="format">Regular-expression pattern.</param>
/// <param name="isFrist">
/// true: return the whole match (group 0); false: return capture group 1.
/// </param>
/// <returns>The selected group's text, or an empty string when nothing matched.</returns>
private static string RegexPP(string source, string format, bool isFrist)
{
    Match found = Regex.Match(source, format, RegexOptions.IgnoreCase);
    int groupIndex = isFrist ? 0 : 1;

    // Groups[n].Value is "" for a failed match or a missing group.
    return found.Groups[groupIndex].Value;
}

/// <summary>
/// Parses a Sogou WeChat search-result page, downloads every listed article
/// and returns the articles that mention the keyword.
/// </summary>
/// <param name="str">HTML of the search-result page.</param>
/// <param name="keywords">Search keyword; several terms are separated by spaces.</param>
/// <param name="cookies">Cookie string sent with each article request.</param>
/// <returns>
/// One WeixinquanMessage per accepted article; an empty list when the page is
/// null/empty or contains no result blocks.
/// </returns>
private static List<WeixinquanMessage> GetTitleContent(string str,string keywords,string cookies)
{
    List<WeixinquanMessage> list = new List<WeixinquanMessage>();
    if (String.IsNullOrEmpty(str))
    {
        return list;
    }

    str = str.Replace("\\r", "").Replace("\\t", "").Replace("\\n", "");

    // A keyword stored as "a*b" arrives here as "a b": split it into its terms.
    kw = (keywords ?? "").Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // One generator for all articles of the page (a new Random per article can
    // repeat values when created in quick succession).
    Random random = new Random();

    // One match per search-result block.
    Regex regexSource = new Regex(@"<div class=""wx-rb wx-rb3"" [^>]*?>(\s|\S)+?(</div>\s*?){3}", RegexOptions.IgnoreCase);
    foreach (Match articleMatch in regexSource.Matches(str))
    {
        string block = articleMatch.Value.Replace("\n", "");
        WeixinquanMessage wm = new WeixinquanMessage();

        // Title and Sogou redirect URL.
        string titleHtml = RegexPP(block, @"<div class=""txt-box""><h4>([\s|\S]*?)</h4>", false);
        wm.Url = "http://weixin.sogou.com" + RegexPP(titleHtml, @"<a [^>]*? href=""([\s\S]*?)""[^>]*?>", false).Replace("amp;", "");
        wm.WeixinTitle = ReplaceZhongWenBiaoDian(Regex.Replace(titleHtml, @"<[\s\S]*?>", "", RegexOptions.IgnoreCase));

        // Publish time (the "t" attribute, Unix seconds) and account name.
        Match metaMatch = Regex.Match(block, @"<div class=""s-p"" [^>]*?>([\s|\S]*?)</a>", RegexOptions.IgnoreCase);
        string publishTime = RegexPP(metaMatch.Groups[0].Value, @"<div class=""s-p"" t=""([\d]*?)""[^>]*?>", false);
        wm.PublishTime = ConvertStringToDateTime(publishTime);
        wm.UserName = RegexPP(metaMatch.Groups[1].Value, @"<a [^>]*? title=""([\s|\S]*?)""[^>]*?>", false);

        // Download the article page itself.
        string allContent = ch.getHtml(wm.Url, "utf-8", cookies, "", "", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0") ?? "";

        // Build the mp.weixin.qq.com URL from the page's script variables. The
        // Sogou URL is kept when any of them is missing, instead of storing a
        // URL with empty parameters.
        string biz = ExtractJsAssignment(allContent, "biz");
        string sn = ExtractJsAssignment(allContent, "sn");
        string mid = ExtractJsAssignment(allContent, "mid");
        string idx = ExtractJsAssignment(allContent, "idx");
        if (biz.Length > 0 && sn.Length > 0 && mid.Length > 0 && idx.Length > 0)
        {
            wm.Url = "http://mp.weixin.qq.com/s?__" + biz + "&" + mid + "&" + idx + "&" + sn + "&3rd=MzA3MDU4NTYzMw==&scene=6#rd";
        }

        string headTitle = RegexPP(allContent, @"<div class=""rich_media_meta_list"">(\s|\S)+?(</div>\s*?){2}", true);

        // Author: the capture must end at the closing </em>; the pattern used
        // to end with an opening <em>.
        wm.Author = RegexPP(headTitle, @"<em class=""rich_media_meta rich_media_meta_text"">([\s|\S]*?)</em>", false);
        if (String.IsNullOrEmpty(wm.Author))
        {
            wm.Author = "空";
        }

        // WeChat id; when the page shows none, generate one from the account
        // name plus a millisecond timestamp ("fff"; the former "ms" repeated
        // minutes and seconds) and a random number.
        wm.UserId = RegexPP(headTitle, @"<span class=""profile_meta_value"">([\s|\S]*?)</span>", false);
        if (String.IsNullOrEmpty(wm.UserId))
        {
            wm.UserId = wm.UserName + (Convert.ToInt64(DateTime.Now.ToString("yyyyMMddHHmmssfff")) + random.Next(0, 1000)).ToString();
        }

        // Article body: group 1 is the inner HTML, group 0 includes the wrapper div.
        Match contentMatch = Regex.Match(allContent, @"<div class=""rich_media_content ""[^>]*?>([\s|\S]*?)</div>", RegexOptions.IgnoreCase);

        // Plain text without any tags.
        string plainText = Regex.Replace(contentMatch.Groups[1].Value, @"<[\s\S]*?>", "", RegexOptions.IgnoreCase);
        wm.WeixinViewContent = ReplaceZhongWenBiaoDian(plainText.Replace("\\r", "").Replace("\\n", "").Replace("\\t", "").Replace(" ", ""));

        // Simplified HTML (tags kept, attributes removed).
        wm.WeixinAllContent = RegexReplaceHtmlTag(contentMatch.Groups[0].Value);
        wm.WeixinAllContent = ReplaceZhongWenBiaoDian(wm.WeixinAllContent.Replace("\\r", "").Replace("\\n", "").Replace("\\t", "").Replace(" ", ""));

        // Checked before the placeholder below is filled in, so the placeholder
        // text itself can never satisfy a keyword.
        bool isKeyWord = MentionsEveryKeyword(kw, wm.WeixinTitle, wm.WeixinViewContent);

        if (String.IsNullOrEmpty(wm.WeixinViewContent))
        {
            wm.WeixinViewContent = "文章内容为空或为图片格式。";
        }

        if (isKeyWord)
        {
            list.Add(wm);
        }
    }

    return list;
}

/// <summary>
/// Returns the first "var NAME = ...;" statement of a page reduced to
/// "NAME=value" (quotes, spaces, "||", "var" and ";" removed), or an empty
/// string when the page has no such statement.
/// </summary>
private static string ExtractJsAssignment(string html, string name)
{
    return RegexPP(html, @"var " + name + @" = (\s|\S)+?;", true).Replace("||", "").Replace("\"", "").Replace(" ", "").Replace("var", "").Replace(";", "");
}

/// <summary>
/// True when every term occurs in the title or in the content.
/// NOTE(review): previously only the second of two terms was checked and the
/// first one was ignored; requiring all terms is the assumed intent - confirm.
/// </summary>
private static bool MentionsEveryKeyword(string[] terms, string title, string content)
{
    foreach (string term in terms)
    {
        bool inTitle = title.IndexOf(term, StringComparison.Ordinal) != -1;
        bool inContent = content.IndexOf(term, StringComparison.Ordinal) != -1;
        if (!inTitle && !inContent)
        {
            return false;
        }
    }

    return true;
}
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: