您的位置:首页 > 编程语言 > C#

点滴积累【C#】---抓取页面中想要的数据

2015-12-30 22:57 633 查看
效果




描述:此功能是抓取外国的一个检测PM2.5的网站。实时读取网站的数据,然后保存到数据库里面。每隔一小时刷新一次。

地址为:http://beijing.usembassy-china.org.cn/070109air.html

筛选后的地址为:http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&

思路:先抓取到页面的所有数据,保存到txt里面,再一行一行的读取txt,然后用split,substring截取到自己想要的数据,最后保存到数据库,在进行插入数据库的时候查看一下是否已经存在,如果不存在则插入。

代码

using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.Data.SqlClient;
using System.IO;
//using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
//using System.Threading.Tasks;

/********************************
* 创建人:青苹果
* 创建时间:2015-12-28
* 描述:获取美利坚合众国的 PM2.5
* ******************************/

namespace GetUSAData
{
class Program
{
//public static string GetURL = System.Configuration.ConfigurationSettings.AppSettings["GetURL"];//获取数据的地址
public static string GetURL = "http://utils.usembassy.gov/feed2js/feed2js.php?src=http%3A%2F%2Fwww.stateair.net%2Fweb%2Frss%2F1%2F1.xml&desc=1&num=7&targ=y&utf=y&pc=y&words=40&";
public static string txtURL = System.Configuration.ConfigurationSettings.AppSettings["txtURL"];//保存为txt文件的路径
public static string conn = ConfigurationManager.ConnectionStrings["ConnectionString"].ToString();

static void Main(string[] args)
{

LoadGO();
}

public static void LoadGO()
{
GetUSA();
List<string[]> getlist = Read(txtURL);
//删除txt
if (File.Exists(txtURL))
{
//如果存在则删除
File.Delete(txtURL);
}
if (getlist.Count > 0)
{
for (int i = getlist.Count-1; i >-1; i--)
{
DateTime dtime = DateTime.Parse(getlist[i][0].ToString());
string getTime = dtime.ToString("yyyy-MM-dd HH:mm");
string controlTime = dtime.ToString("yyyy-MM-dd");
float LatestHourdata1 = float.Parse(getlist[i][2]);
int LatestHourdata2 = Convert.ToInt32(getlist[i][3]);
float Avgdata1 = 0;
int Avgdata2 = 0;
string Avgdata3 = getlist[i][4].ToString();

List<SqlParameter> listWhere = new List<SqlParameter>();
listWhere.Add(new SqlParameter("@strDatetime", controlTime));
string sqlSelect = @"SELECT count(*) as allcount,sum(LatestHourdata1) as LatestHourdata1_avg, sum(LatestHourdata2) as LatestHourdata2_avg
FROM T_twitter  where ([LatestHourdata1] is not null
or [LatestHourdata2] is not null or [Avgdata1] is not null
or [AvgData2] is not null) and   CONVERT(varchar(100), [datetime], 23)=@strDatetime";

DataTable sumDT = ControlDB(sqlSelect, listWhere, "select");    //查询总和用于计算日均值
if (sumDT.Rows.Count > 0)
{
foreach (DataRow itemDR in sumDT.Rows)
{
int allcount = Convert.ToInt32(itemDR["allcount"].ToString());    //数据库中当前日期数量总和
if (allcount > 0)
{
if (itemDR["LatestHourdata1_avg"] != null)
{
Avgdata1 = float.Parse(itemDR["LatestHourdata1_avg"].ToString());   //数据库中LatestHourdata1_avg总和
Avgdata1 = (Avgdata1 + LatestHourdata1) / (allcount + 1);//(数据库的总和+最新的一条)/(数据库的总和数量+1)=日平均值
}
if (itemDR["LatestHourdata2_avg"] != null)
{
Avgdata2 = Convert.ToInt32(itemDR["LatestHourdata2_avg"].ToString());   //数据库中LatestHourdata2_avg总和
Avgdata2 = (Avgdata2 + LatestHourdata2) / (allcount + 1);//(数据库的总和+最新的一条)/(数据库的总和数量+1)=日平均值
}
//根据网站规则判断PM2.5的平均严重性

if (Avgdata2 >= 0 && Avgdata2 <= 50)
{
Avgdata3 = " Good (at 24-hour exposure at this level)";
}
else if (Avgdata2 >= 51 && Avgdata2 <= 100)
{
Avgdata3 = " Moderate (at 24-hour exposure at this level)";
}
else if (Avgdata2 >= 101 && Avgdata2 <= 150)
{
Avgdata3 = " Unhealthy for Sensitive Groups (at 24-hour exposure at this level)";
}
else if (Avgdata2 >= 151 && Avgdata2 <= 200)
{
Avgdata3 = " Unhealthy (at 24-hour exposure at this level)";
}
else if (Avgdata2 >= 201 && Avgdata2 <= 300)
{
Avgdata3 = " Very Unhealthy (at 24-hour exposure at this level)";
}
else
{
Avgdata3 = " Hazardous (at 24-hour exposure at this level)";
}
}
else
{
Avgdata1 = LatestHourdata1;
Avgdata2 = LatestHourdata2;
}
}
}

List<SqlParameter> pars = new List<SqlParameter>();
pars.Add(new SqlParameter("@whereDatetime", getTime));
pars.Add(new SqlParameter("@datetime", getTime));
pars.Add(new SqlParameter("@LatestHourdata1", LatestHourdata1));
pars.Add(new SqlParameter("@LatestHourdata2", LatestHourdata2));
pars.Add(new SqlParameter("@LatestHourdata3", getlist[i][4].ToString()));
pars.Add(new SqlParameter("@Avgdata1", Avgdata1));
pars.Add(new SqlParameter("@Avgdata2", Avgdata2));
pars.Add(new SqlParameter("@Avgdata3", Avgdata3));

string sql = @"if not exists(select * from  dbo.T_twitter where  datetime=@whereDatetime) begin
insert T_twitter (datetime,LatestHourdata1,LatestHourdata2,LatestHourdata3,Avgdata1,AvgData2,AvgData3)
VALUES(@datetime,@LatestHourdata1,@LatestHourdata2,@LatestHourdata3,@Avgdata1,@Avgdata2,@Avgdata3) end";
ControlDB(sql, pars, "");//插入数据
}
}
}

/// <summary>
/// 获取页面数据保存至txt
/// </summary>
public static void GetUSA()
{
WebRequest request = WebRequest.Create(GetURL);
WebResponse response = request.GetResponse();
StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));
//reader.ReadToEnd() 表示取得网页的源码

FileStream fs = new FileStream(txtURL, FileMode.Create);
byte[] data = System.Text.Encoding.Default.GetBytes(reader.ReadToEnd());
//开始写入
fs.Write(data, 0, data.Length);
//清空缓冲区、关闭流
fs.Flush();
fs.Close();
}

/// <summary>
/// 根据路径读取txt文件
/// </summary>
/// <param name="path">txt路径</param>
/// <returns></returns>
public static List<string[]> Read(string path)
{
List<string[]> list = new List<string[]>();
StreamReader sr = new StreamReader(path, Encoding.Default);
String line;
while ((line = sr.ReadLine()) != null)
{
int i = line.ToString().IndexOf("title");
if (i > 0)
{
string titleStr = line.ToString().Substring(i + 7); //截取到title后面的值
string[] titlelist = titleStr.Split('"');        //以"  截取
string titledata = titlelist[0];
string[] datalist = titledata.Split('&');  //以& 截取
string data = datalist[0];
string[] datastrlist = data.Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);//以; 截取
list.Add(datastrlist);
}
}
sr.Close();
return list;
}

/// <summary>
/// 增查表
/// </summary>
/// <returns></returns>
public static DataTable ControlDB(string sql, List<SqlParameter> par, string type)
{
DataAccess controData = new DataAccess();
DataTable resultDT = new DataTable();
if (type == "select")
{
resultDT = controData.GetDataTable(sql, par.ToArray());
}
else
{
int result = controData.ExecuteSql(sql, par.ToArray());
}
return resultDT;
}
}
}


Demo下载:

http://files.cnblogs.com/files/xinchun/GetUSAData.zip
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: