获取搜狗搜索引擎添加任意关键词后的 HTML 源码(C#)
2016-03-22 21:45
567 查看
一直在不断测试中,希望有大神指点如何获取搜狗微信文章。
/// <summary>
/// Console crawler for the Sogou WeChat article search. For every keyword returned by
/// <c>mkBll.GetList</c> it requests the search page (solving the captcha page when it shows up),
/// parses the result entries, downloads each article page and hands every article that
/// passes the keyword filter to <c>wqBll.Add</c>.
/// </summary>
class Program
{
    private static ManageKeywordBll mkBll = new ManageKeywordBll();
    private static WeixinquanMessageBll wqBll = new WeixinquanMessageBll();
    private static CookieHelp ch = new CookieHelp();
    private static string userAgentLiShi = "";
    private static string result = ""; // HTML returned by the most recent search request
    private static string[] kw; // keyword part(s) used by the title/content filter
    private static string id = ""; // id returned by the captcha endpoint; sent as the SNUID cookie value

    // One shared generator: constructing a new Random for every call can yield identical
    // values when the calls happen within the same clock tick (time-based default seed).
    private static readonly Random rng = new Random();

    /// <summary>
    /// Entry point: crawls every configured keyword, stores the matching articles,
    /// prints "OK" and waits for a key press.
    /// </summary>
    static void Main(string[] args)
    {
        ManageKeyword model = new ManageKeyword();
        model.Module = "6,";
        List<ManageKeyword> list = mkBll.GetList(model);

        foreach (ManageKeyword data in list)
        {
            // A '*' in the stored keyword is turned into a space before querying.
            string keyword = data.KeyWord.Replace("*", " ");

            // NOTE(review): the keyword is concatenated into the query string without URL
            // encoding, so a keyword containing '&' or '#' would corrupt the URL. Left unchanged
            // because it is not visible here whether ch.getHtml encodes the URL itself - confirm.
            string url = "http://weixin.sogou.com/weixin?"
                + "query=" + keyword + "&"
                + "_sug_type_=1&"
                + "sut=0&"
                + "sourceid=inttime_all&" // inttime_day: today's content only; inttime_all: all time
                + "ri=0&"
                + "_sug_=n&"
                + "type=2&" // per the original note: type=2 WeChat ID, type=1 official account
                + "ie=utf-8&"
                + "interation=&"
                + "tsn=0&" // 0: all time, 1: within one day, 2: within one week
                + "page=(*)";

            // One search URL yields about 10 entries. While those are being processed the cookie
            // container may end up holding an expired SUID that differs from the one issued for
            // the search request, which invalidates the article links. The cookie string captured
            // at search time is therefore passed on explicitly.
            string cookieAll = GetHtmlByYzm(url, keyword);
            List<WeixinquanMessage> listWeixin = GetTitleContent(result, keyword, cookieAll);

            foreach (WeixinquanMessage wm in listWeixin)
            {
                wm.KeyWord = data.KeyWord;
                wqBll.Add(wm);
            }
        }

        Console.WriteLine("OK");
        Console.ReadKey();
    }

    /// <summary>
    /// Converts a date to the time elapsed since 1970-01-01 (converted to local time).
    /// </summary>
    /// <param name="flag">true: return milliseconds (ticks / 10000); false: return raw ticks.</param>
    /// <param name="time">The date to convert.</param>
    /// <returns>The elapsed time as a long, in the unit selected by <paramref name="flag"/>.</returns>
    private static long ConvertDateTimeToInt(bool flag, System.DateTime time)
    {
        System.DateTime startTime = TimeZone.CurrentTimeZone.ToLocalTime(new System.DateTime(1970, 1, 1, 0, 0, 0, 0));
        long elapsedTicks = time.Ticks - startTime.Ticks;

        // A tick is 100 ns, so dividing by 10000 gives milliseconds (the "13-digit" form).
        return flag ? elapsedTicks / 10000 : elapsedTicks;
    }

    /// <summary>
    /// Converts a timestamp given in whole seconds into a date, relative to 1970-01-01
    /// converted to local time.
    /// </summary>
    /// <param name="timeStamp">Digits of the timestamp in seconds.</param>
    /// <returns>The corresponding date.</returns>
    /// <exception cref="FormatException">The text (with seven zeros appended) is not a number.</exception>
    private static DateTime ConvertStringToDateTime(string timeStamp)
    {
        DateTime dtStart = TimeZone.CurrentTimeZone.ToLocalTime(new DateTime(1970, 1, 1));

        // Appending seven zeros multiplies by 10,000,000, i.e. converts seconds to ticks.
        long ticks = long.Parse(timeStamp + "0000000");
        return dtStart.Add(new TimeSpan(ticks));
    }

    /// <summary>
    /// Picks a random entry of <c>UserAgentList.userAgentShuZu</c> and stores it in
    /// <c>userAgentLiShi</c>.
    /// </summary>
    private static void RandomUserAgent()
    {
        int random = rng.Next(0, UserAgentList.userAgentShuZu.Length);
        userAgentLiShi = UserAgentList.userAgentShuZu[random];
    }

    /// <summary>
    /// Solves the anti-spider captcha (up to 10 attempts) and extracts the id from the
    /// response of the verification endpoint.
    /// </summary>
    /// <param name="keywords">Keyword of the search that triggered the captcha.</param>
    /// <returns>
    /// The id parsed from the last response, or an empty string when that response
    /// contains no "id" entry.
    /// </returns>
    private static string GetSNUID(string keywords)
    {
        string response = "";

        for (int j = 0; j < 10; j++)
        {
            // Start every attempt without any previously collected cookies.
            ch.MyCookieContainer = new CookieContainer();
            ch.Cookie = "";

            long tc = ConvertDateTimeToInt(true, DateTime.Now) / 1000;
            string imageUrl = "http://weixin.sogou.com/antispider/util/seccode.php?tc=" + tc.ToString() + ""; // src of the captcha image
            Image image = ch.getImage(imageUrl, "UTF-8");
            CodeFactory cf = new CodeFactory(CodeType.YDM);
            CodeModel cm = cf.GetCode(image);

            string url = "http://weixin.sogou.com/antispider/thank.php";
            string postdata = "c=" + cm.Code + "&r=/weixin?type=2&query=" + keywords + "&ie=utf8&_sug_=n&_sug_type_=&v=5";
            string referer = HttpUtility.UrlEncode("http://weixin.sogou.com/antispider/?from=/weixin?type=2&query=" + keywords + "&ie=utf8&_sug_=n&_sug_type_=");

            response = ch.PostAndGetResult(url, "UTF-8", postdata, "", "", referer);
            if (response.IndexOf("解封成功,正在为您跳转来源地址...") > -1)
            {
                break;
            }
        }

        // Without this guard a response lacking "id" (e.g. all attempts failed) made
        // Substring throw ArgumentOutOfRangeException and aborted the whole crawl.
        int idPos = response.IndexOf("id");
        if (idPos < 0)
        {
            return "";
        }

        // Drop quotes, colons and spaces; what remains starts with "id" followed by the value
        // and one trailing character, which Substring(2, Length - 3) cuts away.
        string stripped = response.Substring(idPos).Replace("\"", "").Replace(":", "").Replace(" ", "");
        if (stripped.Length < 3)
        {
            return "";
        }

        return stripped.Substring(2, stripped.Length - 3);
    }

    /// <summary>
    /// Downloads the search page into <c>result</c>, retrying with captcha-bypass cookies
    /// (up to 10 rounds) while the captcha prompt is present in the response.
    /// </summary>
    /// <param name="urls">URL to request.</param>
    /// <param name="keywords">Keyword of the search, forwarded to <see cref="GetSNUID"/>.</param>
    /// <returns>The cookie string used for the last request.</returns>
    private static string GetHtmlByYzm(string urls, string keywords)
    {
        // NOTE(review): the spacing inside this first User-Agent looks mangled; it is kept
        // byte-for-byte because it is sent to the server - confirm the intended value.
        result = ch.getHtml(urls, "UTF-8", "", "", "", " Mozilla / 5.0(Windows NT 10.0; WOW64; rv: 45.0) Gecko / 20100101 Firefox / 45.0");
        string cookies = ch.Cookie;

        for (int i = 0; i < 10; i++)
        {
            bool isOk = true;

            if (result.IndexOf("请输入验证码") > -1)
            {
                isOk = false;

                int snuidPos = cookies.IndexOf("SNUID");
                if (snuidPos > -1)
                {
                    // Cut the old SNUID entry together with its preceding separator. The length is
                    // clamped so that an SNUID at index 0 no longer produces a negative length.
                    cookies = cookies.Substring(0, snuidPos > 0 ? snuidPos - 1 : 0);
                }

                // Per the author's testing: once the captcha appears, seccodeRight=success and
                // refresh=1 are required, while the successCount entry can be left out.
                cookies += ";SUV=00677DCD6F11A86256FCD05375656513;seccodeRight=success;refresh=1;SNUID=" + id;
                result = ch.getHtml(urls, "UTF-8", cookies, "", "", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0");
            }

            // Still blocked: fetch a fresh id. Per the original note, one id is good for
            // roughly 4-5 requests.
            if (result.IndexOf("请输入验证码") > -1)
            {
                id = GetSNUID(keywords);
            }

            if (isOk)
            {
                break;
            }
        }

        return cookies;
    }

    /// <summary>
    /// Replaces a fixed set of HTML entities with the characters they stand for.
    /// </summary>
    /// <param name="parm">Text that may contain HTML entities.</param>
    /// <returns>The text with the listed entities replaced.</returns>
    private static string ReplaceZhongWenBiaoDian(string parm)
    {
        // The published listing had these entities already decoded, which left two identical
        // tables (a no-op) and an unterminated string literal. The entity names are restored
        // from the replacement characters.
        // NOTE(review): "&#39;" for the apostrophe is an assumption (could have been "&apos;").
        string[] entities = { "&nbsp;", "&amp;", "&quot;", "&#39;", "&ldquo;", "&rdquo;", "&mdash;", "&lt;", "&gt;", "&middot;", "&hellip;" };
        string[] characters = { " ", "&", "\"", "'", "“", "”", "—", "<", ">", "·", "…" };

        for (int i = 0; i < entities.Length; i++)
        {
            parm = parm.Replace(entities[i], characters[i]);
        }

        return parm;
    }

    /// <summary>
    /// Strips all attributes (class, style, ...) from opening tags, turns attribute-carrying
    /// h1-h6 tags into p tags and removes the bare img and iframe tags that result.
    /// </summary>
    /// <param name="htmltag">HTML fragment to clean.</param>
    /// <returns>The cleaned fragment.</returns>
    private static string RegexReplaceHtmlTag(string htmltag)
    {
        // Collect the distinct names of all opening tags that carry attributes.
        List<string> tagNames = new List<string>();
        Regex regex = new Regex(@"<([a-z|A-Z|0-9]+?) [^>]*?>", RegexOptions.IgnoreCase);
        foreach (Match m in regex.Matches(htmltag))
        {
            string found = m.Groups[1].Value;
            if (!tagNames.Contains(found))
            {
                tagNames.Add(found);
            }
        }

        string[] h = { "h1", "h2", "h3", "h4", "h5", "h6" }; // heading tags are replaced by p tags

        foreach (string tagName in tagNames)
        {
            // The capture class above also admits '|', which would act as alternation if the
            // name were inserted into the pattern unescaped.
            htmltag = Regex.Replace(htmltag, @"<" + Regex.Escape(tagName) + " [^>]*?>", "<" + tagName + ">", RegexOptions.IgnoreCase);

            if (Array.IndexOf(h, tagName) >= 0)
            {
                htmltag = htmltag.Replace("<" + tagName + ">", "<p>").Replace("</" + tagName + ">", "</p>");
            }
        }

        return htmltag.Replace("<img>", "").Replace("<iframe>", "").Replace("</iframe>", "");
    }

    /// <summary>
    /// Runs a case-insensitive regex against the text and returns part of the first match.
    /// </summary>
    /// <param name="source">Text to search.</param>
    /// <param name="format">Regular expression.</param>
    /// <param name="isFrist">true: return the whole match (group 0); false: return group 1.</param>
    /// <returns>The selected group's value; empty when nothing matched.</returns>
    private static string RegexPP(string source, string format, bool isFrist)
    {
        Match m = new Regex(format, RegexOptions.IgnoreCase).Match(source);
        return isFrist ? m.Groups[0].Value : m.Groups[1].Value;
    }

    /// <summary>
    /// Applies the keyword filter held in <c>kw</c> to a piece of text.
    /// </summary>
    /// <param name="text">Title or content to test.</param>
    /// <returns>true when the text contains the deciding keyword part.</returns>
    private static bool ContainsKeyword(string text)
    {
        // NOTE(review): for a split keyword only the second part (kw[1]) decides the outcome;
        // the first part is effectively ignored. This mirrors the original logic exactly -
        // confirm whether "both parts must occur" was intended.
        string needle = kw.Length > 1 ? kw[1] : kw[0];
        return text.IndexOf(needle) != -1;
    }

    /// <summary>
    /// Extracts the text of a <c>var NAME = ...;</c> statement from the article page and
    /// reduces it to the form <c>NAME=value</c> by removing "||", quotes, spaces, "var" and ";".
    /// </summary>
    /// <param name="html">Article page source.</param>
    /// <param name="name">Name of the script variable.</param>
    /// <returns>The reduced statement; empty when the statement is not found.</returns>
    private static string ExtractScriptVar(string html, string name)
    {
        return RegexPP(html, @"var " + name + @" = (\s|\S)+?;", true)
            .Replace("||", "")
            .Replace("\"", "")
            .Replace(" ", "")
            .Replace("var", "")
            .Replace(";", "");
    }

    /// <summary>
    /// Parses the search result page, downloads every listed article and builds a message
    /// object for each article whose title or content passes the keyword filter.
    /// </summary>
    /// <param name="str">HTML source of the search result page.</param>
    /// <param name="keywords">Keyword; when it contains a space it is split into parts.</param>
    /// <param name="cookies">Cookie string sent with the article requests.</param>
    /// <returns>The list of WeixinquanMessage objects that passed the filter.</returns>
    private static List<WeixinquanMessage> GetTitleContent(string str, string keywords, string cookies)
    {
        str = str.Replace("\\r", "").Replace("\\t", "").Replace("\\n", "");
        List<WeixinquanMessage> list = new List<WeixinquanMessage>();

        // A keyword that contained '*' arrives here with a space and is split into its parts.
        if (keywords.Contains(" "))
        {
            kw = keywords.Split(' ');
        }
        else
        {
            kw = new string[] { keywords };
        }

        // One match per search result entry.
        Regex regexSource = new Regex(@"<div class=""wx-rb wx-rb3"" [^>]*?>(\s|\S)+?(</div>\s*?){3}", RegexOptions.IgnoreCase);

        foreach (Match entry in regexSource.Matches(str))
        {
            string item = entry.Groups[0].Value.Replace("\n", "");
            WeixinquanMessage wm = new WeixinquanMessage();

            // Title and link share the same h4 element.
            string heading = RegexPP(item, @"<div class=""txt-box""><h4>([\s|\S]*?)</h4>", false);
            wm.Url = "http://weixin.sogou.com" + RegexPP(heading, @"<a [^>]*? href=""([\s\S]*?)""[^>]*?>", false).Replace("amp;", "");
            wm.WeixinTitle = ReplaceZhongWenBiaoDian(Regex.Replace(heading, @"<[\s\S]*?>", "", RegexOptions.IgnoreCase));

            // Tracks whether the keyword was found in the title or, failing that, the content.
            bool isKeyWord = ContainsKeyword(wm.WeixinTitle);

            // Publish time: the t attribute of the s-p div holds the timestamp digits.
            string divTxtBox = RegexPP(item, @"<div class=""s-p"" [^>]*?>([\s|\S]*?)</a>", true);
            string publishTime = RegexPP(divTxtBox, @"<div class=""s-p"" t=""([\d]*?)""[^>]*?>", false);
            if (String.IsNullOrEmpty(publishTime))
            {
                // Previously an entry without a timestamp made long.Parse throw and ended the
                // whole run; now only this entry is skipped.
                Console.Error.WriteLine("Skipping search result without publish time: " + wm.Url);
                continue;
            }
            wm.PublishTime = ConvertStringToDateTime(publishTime);

            // WeChat user name: title attribute of the link inside the s-p div.
            string userBlock = RegexPP(item, @"<div class=""s-p"" [^>]*?>([\s|\S]*?)</a>", false);
            wm.UserName = RegexPP(userBlock, @"<a [^>]*? title=""([\s|\S]*?)""[^>]*?>", false);

            // Download the article page to obtain content, author and the permanent URL parts.
            string allContent = ch.getHtml(wm.Url, "utf-8", cookies, "", "", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0");

            // Rebuild the article URL from the script variables of the page; each value comes
            // back as "name=value", and "__" + "biz=..." forms the __biz parameter.
            string biz = ExtractScriptVar(allContent, "biz");
            string sn = ExtractScriptVar(allContent, "sn");
            string mid = ExtractScriptVar(allContent, "mid");
            string idx = ExtractScriptVar(allContent, "idx");
            wm.Url = "http://mp.weixin.qq.com/s?__" + biz + "&" + mid + "&" + idx + "&" + sn + "&3rd=MzA3MDU4NTYzMw==&scene=6#rd";

            string headTitle = RegexPP(allContent, @"<div class=""rich_media_meta_list"">(\s|\S)+?(</div>\s*?){2}", true);

            // Author.
            // NOTE(review): the pattern ends in "<em>" rather than "</em>"; kept as found - confirm.
            wm.Author = RegexPP(headTitle, @"<em class=""rich_media_meta rich_media_meta_text"">([\s|\S]*?)<em>", false);
            if (String.IsNullOrEmpty(wm.Author))
            {
                wm.Author = "空";
            }

            // WeChat ID; when the page has none, synthesise one from the user name, the current
            // time and a random offset.
            wm.UserId = RegexPP(headTitle, @"<span class=""profile_meta_value"">([\s|\S]*?)</span>", false);
            if (String.IsNullOrEmpty(wm.UserId))
            {
                int random = rng.Next(0, 1000);
                // "fff" is the milliseconds specifier; the former "ms" emitted minute and second again.
                wm.UserId = wm.UserName + (Convert.ToInt64(DateTime.Now.ToString("yyyyMMddHHmmssfff")) + random).ToString();
            }

            // Plain-text content: all tags removed.
            string txt = RegexPP(allContent, @"<div class=""rich_media_content ""[^>]*?>([\s|\S]*?)</div>", false);
            string viewContent = Regex.Replace(txt, @"<[\s\S]*?>", "", RegexOptions.IgnoreCase);
            viewContent = ReplaceZhongWenBiaoDian(viewContent.Replace("\\r", "").Replace("\\n", "").Replace("\\t", "").Replace(" ", ""));

            // Markup content: tags kept, attributes stripped.
            string section = RegexPP(allContent, @"<div class=""rich_media_content ""[^>]*?>([\s|\S]*?)</div>", true);
            string allMarkup = RegexReplaceHtmlTag(section);
            wm.WeixinAllContent = ReplaceZhongWenBiaoDian(allMarkup.Replace("\\r", "").Replace("\\n", "").Replace("\\t", "").Replace(" ", ""));

            if (String.IsNullOrEmpty(viewContent))
            {
                viewContent = "文章内容为空或为图片格式。";
            }
            wm.WeixinViewContent = viewContent;

            // Keyword not in the title: fall back to the article content.
            if (!isKeyWord)
            {
                isKeyWord = ContainsKeyword(wm.WeixinViewContent);
            }

            // Found in neither title nor content: drop this article.
            if (!isKeyWord)
            {
                continue;
            }

            list.Add(wm);
        }

        return list;
    }
}
相关文章推荐
- html页面内容超出后显示水平滚动条的问题
- sitemesh html 本地乱码服务器正常
- HTML基础篇--网页布局
- jq val() 和 html() 用法注意
- html 最简遮罩层
- 如何在HTML中调用百度地图API
- 利用html 5 websocket做个山寨版web聊天室(手写C#服务器)
- html中事件处理中的this和event对象
- HTML编码规范
- HTML meta标签总结与属性使用介绍
- html第四节课
- C# HTML转换为WORD
- Markdown+Pandoc→HTML幻灯片速成
- HTML笔试题大全
- HTML基本语法
- HTML特殊字符编码对照表
- 使用HTML.ActionLink实现一个图片链接
- html 虚线
- 优化网页速度的7种方法
- HTML编码规范