您的位置:首页 > 其它

设计自动获取网页和提交表单组件

2005-12-14 09:38 621 查看
一次做列车时刻表时想到从网页上抓取数据,设计了这个类。

///网页获取器
class HtmlGetter
{
public:
HtmlGetter();
HtmlGetter::HtmlGetter(CString name) ;
CString get_one_page(std::string url);
~HtmlGetter();
CString get_page_by_post(std::vector<CString> inputs, CString pre_url);
CString get_page_by_get(std::vector<CString> inputs, CString pre_url);
void get_file(std::string url, CString SaveToFile);
CString get_last_header();
void Close();
private:
CString LastHeader;
CInternetSession session;
};

使用方法

HtmlGetter htmlgetter;
CString Page1=htmlgetter.get_one_page(http://163.com);

//设置好网页form的inputs
vector<CString> cmds;
cmds.push_back("pawsoperator");//input name
cmds.push_back("1");//input value
……

//post,
htmlgetter.get_page_by_post(cmds,"http://......");

实现如下:

#include "stdafx.h"

#include "string"
#include "vector"
#include <iostream>
#include <fstream>
#include "./htmlgetter.h"
using namespace std;
//char HtmlGetter::SessSerial=0;
//CString HtmlGetter::SessName;
/// Default constructor: builds the session member from the fixed string "session_".
HtmlGetter::HtmlGetter()
    : session("session_")
{
    // Nothing else to initialise; LastHeader starts out empty.
}
/// Constructor taking a caller-chosen string that is handed to the session member.
/// @param name  Value forwarded to the CInternetSession constructor.
HtmlGetter::HtmlGetter(CString name)
    : session(name)
{
    // Nothing else to initialise; LastHeader starts out empty.
}
/// Destructor: closes the session through the public Close() member.
HtmlGetter::~HtmlGetter()
{
    Close();
}
void HtmlGetter::Close()
{
session.Close();
}
/// Returns a copy of the stored response headers (the LastHeader member).
CString HtmlGetter::get_last_header()
{
    return this->LastHeader;
}
/**
 * Fetches a web page.
 *
 * Opens @p url through the session and returns the body as text. When the
 * opened file is a CHttpFile, its raw response headers are queried and, if
 * the query succeeds, stored so that get_last_header() returns them.
 *
 * The body is appended chunk by chunk as NUL-terminated text, so a NUL byte
 * inside a chunk cuts that chunk short; use get_file() for binary content.
 *
 * @param url  URL to open.
 * @return The page body, or the text "OpenURL exception. " when the URL
 *         could not be opened.
 * @note An MFC exception raised while querying headers or reading the body
 *       is rethrown to the caller after the file object has been deleted.
 */
CString HtmlGetter::get_one_page(std::string url)
{
    CStdioFile* pFile = NULL;
    try
    {
        pFile = session.OpenURL(url.c_str());
    }
    catch (CInternetException* e)
    {
        // MFC exceptions are thrown as heap pointers; the handler must release them.
        e->Delete();
        logger->warn("OpenURL exception");
        return CString("OpenURL exception. ");
    }
    if (pFile == NULL)
    {
        // OpenURL can hand back NULL instead of throwing; never dereference it.
        logger->warn("OpenURL exception");
        return CString("OpenURL exception. ");
    }

    CString Ret;
    char szBuff[1024];
    try
    {
        // Only an HTTP file can answer a header query; other file kinds skip it.
        if (pFile->IsKindOf(RUNTIME_CLASS(CHttpFile)))
        {
            CString header1;
            if (static_cast<CHttpFile*>(pFile)->QueryInfo(HTTP_QUERY_RAW_HEADERS_CRLF,
                                                          header1))
            {
                logger->debugStream() << "http header:" << header1;
                LastHeader = header1;
            }
        }

        // Read at most sizeof-1 bytes so there is always room for the terminator.
        UINT nRead = pFile->Read(szBuff, sizeof(szBuff) - 1);
        while (nRead > 0)
        {
            szBuff[nRead] = '\0';
            Ret += szBuff;
            nRead = pFile->Read(szBuff, sizeof(szBuff) - 1);
        }
    }
    catch (CException*)
    {
        // Do not leak the file object; the caller still sees the exception.
        delete pFile;
        throw;
    }

    delete pFile;
    return Ret;
}
/**下载文件
@return 文件
@author 张浩
*/
void HtmlGetter::get_file(std::string url, CString SaveToFile)
{
//下载文件必须用二进制打开
std::ofstream savefile(SaveToFile, ios::binary | ios::out);
CStdioFile* pFile = NULL;
char szBuff[1024];
try
{
pFile = session.OpenURL(url.c_str());
}
catch (CInternetException* )
{
logger->warn("OpenURL exception");
return ;//CString("<html><body>OpenURL exception</body></html>");
}
memset(szBuff, 0, sizeof(szBuff));
UINT nRead = pFile->Read(szBuff, 1023);
while (nRead > 0)
{
//Ret += (szBuff);
savefile.write(szBuff, nRead);
memset(szBuff, 0, sizeof(szBuff));
nRead = pFile->Read(szBuff, 1023);
}
delete pFile;
//return Ret;
}
/**POST提交后返回页面
@param inputs name value 对
@param pre_url 提交至action页面
@return 页面
@author 张浩
*/
CString HtmlGetter::get_page_by_post(std::vector<CString> inputs,
CString pre_url)
{
//pre_url http://ffsfsdfdsfs:80/fsdfs/fsd.asp
int pos = pre_url. Find('/'), pos2;
pos2 = pos;
if (pre_url.Find('/', pos + 1) == pos + 1)
pos2 = pre_url.Find('/', pos + 2);
CString server = (pos == pos2) ?
(pre_url.Left(pos)) :
(pre_url.Mid(pos + 2, pos2 - pos - 2));
//server ffsfsdfdsfs:80
CString actionform = pre_url.Mid(pos2);
//actionform /fsdfs/fsd.asp
CString Port = "80";
if (server.Find(':') != -1)
{
Port = server.Mid(server.Find(':') + 1);
server = server.Left(server.Find(':'));
}
INTERNET_PORT port = atoi(Port);

logger->debugStream() << "actionform " << actionform;
CString strHeaders = _T("Content-Type: application/x-www-form-urlencoded");
// URL-encoded form variables -
// name = "John Doe", userid = "hithere", other = "P&Q"

//url加密
CString params;
if ((inputs.size() % 2) != 0)//不是2的倍数,说明解析错?
{
logger->error("parse post inputs error ");//of << "parse cmd error" << endl;
return CString("<html><body>parse post inputs error. ");/*I will retry " +
GetConfString("general", "retry") +
" times.<br><br><br>PMKPkZ1AFf08CkMUPgvMuJiw</body></html>");*/
}
for (vector<CString>::iterator i = inputs.begin();
i != inputs.end();
++i,++i)
{
if (i != inputs.begin())
params += '&';
params += UrlEncode(i->GetBuffer(i->GetLength()));
logger->debugStream() << "UrlEncode param1:"
<< i->GetBuffer(i->GetLength()) << " to "
<< UrlEncode(i->GetBuffer(i->GetLength()));
params += '=';
vector<CString>::iterator j = i;++j;
params += UrlEncode(j->GetBuffer(j->GetLength()));
logger->debugStream() << "UrlEncode param2:"
<< j->GetBuffer(j->GetLength()) << " to "
<< UrlEncode(j->GetBuffer(j->GetLength()));
}
CString strFormData = params;//_T(params.GetBuffer(params.GetLength())/* "search_txt=xmlspy"*/);
logger->debugStream() << "strFormData: " << strFormData;
CString Ret;
try
{
//CInternetSession session;
CHttpConnection* pConnection = session.GetHttpConnection(server/*_T("ServerNameHere")*/,
port);
CHttpFile* pFile = pConnection->OpenRequest(CHttpConnection::HTTP_VERB_POST,
actionform/*_T("FormActionHere")*/);
BOOL result = pFile->SendRequest(strHeaders,
(LPVOID) (LPCTSTR) strFormData,
strFormData.GetLength());
char szBuff[1024];
memset(szBuff, 0, sizeof(szBuff));
UINT nRead = pFile->Read(szBuff, 1023);
while (nRead > 0)
{
Ret += (szBuff);
memset(szBuff, 0, sizeof(szBuff));
nRead = pFile->Read(szBuff, 1023);
}
delete pFile;
}
catch (CInternetException* )
{
logger->warn("get_page_by_post exception");
return CString("<html><body>get_page_by_post exception .");/* maybe url is not exist. I will retry " +
GetConfString("general", "retry") +
" times.<br><br><br>PMKPkZ1AFf08CkMUPgvMuJiw</body></html>");*/
}

return Ret;
}
/**
 * Submits a form with GET and returns the response page.
 *
 * The inputs are URL-encoded, joined into a query string, appended to
 * @p pre_url, and the resulting URL is fetched with get_one_page().
 * The query string is attached with '?' when @p pre_url has none yet and
 * with '&' when it already contains one; with no inputs, @p pre_url is
 * fetched unchanged.
 *
 * @param inputs   Flat list of alternating input names and values
 *                 (name, value, name, value, ...).
 * @param pre_url  URL of the action page the form is submitted to.
 * @return The response page, or "<html><body>parse get inputs error." when
 *         @p inputs has an odd number of entries.
 * @author Zhang Hao
 */
CString HtmlGetter::get_page_by_get(std::vector<CString> inputs,
                                    CString pre_url)
{
    // An odd count means some name has no value.
    if ((inputs.size() % 2) != 0)
    {
        logger->error("parse get inputs error ");
        return CString("<html><body>parse get inputs error.");
    }

    // Build the URL-encoded query: name1=value1&name2=value2...
    // UrlEncode is declared elsewhere; it is called with the same argument form
    // as before, but only once per entry instead of once for use and once for logging.
    CString params;
    for (std::vector<CString>::size_type k = 0; k + 1 < inputs.size(); k += 2)
    {
        if (k != 0)
            params += '&';

        CString& name = inputs[k];
        LPTSTR rawName = name.GetBuffer(name.GetLength());
        const CString encodedName(UrlEncode(rawName));
        params += encodedName;
        logger->debugStream() << "UrlEncode param1:" << rawName << " to "
                              << encodedName;

        params += '=';

        CString& value = inputs[k + 1];
        LPTSTR rawValue = value.GetBuffer(value.GetLength());
        const CString encodedValue(UrlEncode(rawValue));
        params += encodedValue;
        logger->debugStream() << "UrlEncode param2:" << rawValue << " to "
                              << encodedValue;
    }

    CString fullUrl = pre_url;
    if (!params.IsEmpty())
    {
        if (pre_url.Find('?') == -1)
        {
            fullUrl += '?';
        }
        else
        {
            // The URL already carries a query string: extend it instead of
            // starting a second one, unless it already ends in a separator.
            const TCHAR last = pre_url[pre_url.GetLength() - 1];
            if (last != '?' && last != '&')
                fullUrl += '&';
        }
        fullUrl += params;
    }

    // Direct initialisation: CString -> LPCTSTR -> std::string needs an explicit step.
    const std::string newurl(static_cast<LPCTSTR>(fullUrl));
    logger->debugStream() << "newurl: " << newurl;

    return this->get_one_page(newurl);
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: