您的位置:首页 > 理论基础 > 计算机网络

一个小型浏览器架构:HTTP通讯、COOKIE处理、HTML解析、JS模拟、表单处理

2010-09-04 14:42 676 查看
前段时间作了一个HTML的解析类,方便在蜘蛛、信息发布、小偷程序中正确抓取网页内容。

有人可能会说,为啥不用Webbrowser呢。

1,首先是效率问题,Webbrowser太慢了。

2,你无法操作Webbrowser上传一个文件。

3,你无法操作Webbrowser跨域的Iframe

4,你不能不按套路出牌,例如页面setTimeout 100秒输出一个div,你也只能100秒后才能获取到。

5,如果你只想在后台解析HTML,不想让用户看到浏览器,Webbrowser会很碍事。

6,阻止浏览器下载图片、CSS、Flash不是一件容易的事情,而这写内容通常没啥用。

。。。。

因为是一个商业软件,不太方便直接公开全部源代码,列出各头大家了解一下吧。如果大家比较感兴趣,稍后我会核心部分做一个说明。

HTTP通讯部分

const
DoNotPush:set of TLabelEnum=[csTitle,csImg,csStyle,csScript,csInput,csTextArea];

TLabelNames:Array [TLabelEnum] of  String = (
'a','img','style','link','script','title','form','input','select',
'button','textarea','option','');

DelimiterWord:array[0..6] of String = ('','//','/*','*/','');
DelimiterChar= ['<','>','/','\','=','&',';','"','''',' ',#9,#13,#10];
WhiteSpaceChar= [' ',#9,#13,#10];

type

TFindin = (fiAll,fiForms,fiFields,fiAnchors,fiImages,fiStyles,fiScripts);

THTMLDocument = class;
THTMLParser = class;

THTMLElementCollection = class;
THTMLElement = class
private
FID:String;
FName:String;
FTagName:String;
FParent:THTMLElement;
FChildElements:THTMLElementCollection;
FDocument:THTMLDocument;
FAttributes:TDictionary;
procedure Assign(Target:THTMLElement); virtual;
function GetInnerHTML:String;
procedure SetInnerHTML(val:String);
function GetOuterHTML:string;
procedure SetOuterHTML(val:String);
procedure AppendChildElement(Element:THTMLElement);
procedure RemoveChildElement(Element:THTMLElement);
public
constructor Create(Parent:THTMLElement); virtual;
destructor Destroy; override;
function Clone:THTMLElement; virtual;
procedure SetAttribute(AName,AValue:String); virtual;
function GetAttribute(AName:String):String; virtual;
function GetElementById(AID:String):THTMLElement;
function GetElementsByName(AName:String):THTMLElementCollection;
function GetElementsByTagName(ATagName:String):THTMLElementCollection;
function HasChildElement(Child:THTMLElement):Boolean;

property Document:THTMLDocument read FDocument write FDocument;
property Attributes: TDictionary read FAttributes;
property ChildElements: THTMLElementCollection read FChildElements;
property Attribute[Name:String]:string read GetAttribute write SetAttribute;
property Parent:THTMLElement read FParent;
property ParentNode:THTMLElement read FParent;
property ID:String read FID write FID;
property Name:String read FName write FName;
property TagName:String read FTagName write FTagName;
property InnerHTML:String read GetInnerHTML write SetInnerHTML;
property OuterHTML:String read GetOuterHTML write SetOuterHTML;
end;
THTMLClass = class of THTMLElement;

THTMLCollection = class(TList)
private
FDocument:THTMLDocument;
function GetCount: Integer;
public
function Item(Index:Integer): T;
property Length:Integer read GetCount;
property Document:THTMLDocument read FDocument write FDocument;
end;

THTMLScript = class(THTMLElement)
private
FSrc:String;
FType:String;
FText:String;
FLanguage:String;
public
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;

property Src:String read FSrc write FSrc;
property AType:String read FType write FType;
property Text:String read FText write FText;
property Language:String read FLanguage write FLanguage;
end;

THTMLStyle = class(THTMLElement)
private
FCSSText :String;
public
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;
property CSSText:String read FCSSText write FCSSText;
end;

THTMLImage = class(THTMLElement)
private
FAlt:String;
FTitle:String;
FSrc:String;
function GetTextValue: string;
public
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;
property Alt:String read FAlt write FAlt;
property Title:String read FTitle write FTitle;
property Src:String read FSrc write FSrc;
property TextValue:string read GetTextValue;
end;

THTMLAnchor =class(THTMLElement)
private
FHref:String;
FDisplay:String;
public
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;
property Href:String read FHref write FHref;
property Display:String read FDisplay write FDisplay;
end;

THTMLForm = class;
THTMLField = class(THTMLElement)
private
FValue:String;
FDisplay:String;
procedure Assign(Target:THTMLElement); override;
function GetActive:Boolean; virtual; abstract;
protected
FParentForm:THTMLForm;
public
destructor Destroy; override;
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;
property Active:Boolean read GetActive;
property ParentForm:THTMLForm read FParentForm;
property Value:String read FValue write FValue;
property Display:String read FDisplay write FDisplay;
end;

THTMLTextArea = class(THTMLField)
private
function GetActive:Boolean; override;
public
procedure Random;
end;

THTMLButton = class(THTMLField)
private
FType:String;
procedure Assign(Target:THTMLElement); override;
function GetActive:Boolean; override;
public
constructor Create(Parent:THTMLElement); override;
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;
property AType:String read FType write FType;
end;

THTMLSelect = class;
THTMLOption = class(THTMLField)
private
FSelected: Boolean;
FParentSelect: THTMLSelect;
procedure Assign(Target:THTMLElement); override;
function GetActive:Boolean; override;
procedure SetSelected(val:Boolean);
public
destructor Destroy; override;
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;
property ParentSelect:THTMLSelect read FParentSelect;
property Selected: Boolean read FSelected write SetSelected;
end;

THTMLInput = class(THTMLField)
private
FType:String;
FChecked: Boolean;
procedure Assign(Target:THTMLElement); override;
function GetActive:Boolean; override;
procedure SetChecked(val:Boolean);
public
constructor Create(Parent:THTMLElement); override;
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;
procedure Random;
property AType:string read FType write FType;
property Checked: Boolean read FChecked write SetChecked;
end;

THTMLOptionCollection = class;
THTMLSelect = class(THTMLField)
private
FSelected: THTMLOption;
FOptions: THTMLOptionCollection;
procedure Assign(Target:THTMLElement); override;
function GetActive:Boolean; override;
procedure SetSelected(val: THTMLOption);
procedure SetParentFrom(const Value: THTMLForm);
public
constructor Create(Parent:THTMLElement); override;
destructor Destroy; override;

procedure Random;
procedure AddOption(val:THTMLOption);
procedure Notify(Element: THTMLOption; Operation: TOperation);
property ParentFrom:THTMLForm read FParentForm write SetParentFrom;
property Selected:THTMLOption read FSelected write SetSelected;
property Options:THTMLOptionCollection read FOptions;
end;

TRadioFields = Array of THTMLInput;
THTMLFieldCollection = class;

THTMLForm = class(THTMLElement)
private
FMethod:String;
FENCType:String;
FAction:String;
FReferer:String;
FFields:THTMLFieldCollection;
procedure Assign(Target:THTMLElement); override;
function GetRadioFields(AName:String):TRadioFields;
function GetActiveFields(AName:String):THTMLField;
function GetFieldValues(AName:String):String;
procedure SetFieldValues(AName,AValue:String);
public
constructor Create(Parent:THTMLElement); override;
destructor Destroy; override;

procedure AddField(val:THTMLField);
procedure Notify(Element: THTMLField; Operation: TOperation);
function Find(Keys,Values:string):THTMLField; overload;
function Find(Keys,Values:string; Match:TMatch):THTMLField; overload;
function Find(Keys,Values:array of string):THTMLField; overload;
function Find(Keys,Values:array of string; Match:TMatch):THTMLField; overload;
function GetAttribute(AName:String):String; override;
procedure SetAttribute(AName,AValue:String); override;
function Submit(PostField:array of String; const ButtonName:string=''):Integer;
function CheckBox(Checked:Boolean):Boolean; overload;
function CheckBox(AName:String; Checked:Boolean):Boolean; overload;
property Fields:THTMLFieldCollection read FFields;
property ActiveFields[Name:String]:THTMLField read GetActiveFields;
property RadioFields[Name:String]:TRadioFields read GetRadioFields;
property FieldValues[Name:String]:String read GetFieldValues write SetFieldValues;
property Method:String read FMethod write FMethod;
property ENCType:String read FENCType write FENCType;
property Action:String read FAction write FAction;
end;

THTMLElementCollection = class(THTMLCollection);
THTMLScriptCollection = class(THTMLCollection);
THTMLStyleCollection = class(THTMLCollection);
THTMLImageCollection = class(THTMLCollection);
THTMLAnchorCollection = class(THTMLCollection);
THTMLFieldCollection = class(THTMLCollection);
THTMLOptionCollection = class(THTMLCollection);
THTMLFormCollection = class(THTMLCollection)
public
function Find(Keys,Values:String):THTMLForm; overload;
function Find(Keys,Values:String; Match:TMatch):THTMLForm; overload;
function Find(Keys,Values:array of String; Match:TMatch):THTMLForm; overload;
function FindByField(Keys,Value:String):THTMLForm; overload;
function FindByField(Keys,Value:String; Match:TMatch):THTMLForm; overload;
function FindByField(Keys,Values:array of String; Match:TMatch):THTMLForm; overload;
function FindWithField(FormKeys,FormValues:String;
FieldKeys,FieldValues:String):THTMLForm; overload;
function FindWithField(FormKeys,FormValues:String; FormMatch:TMatch;
FieldKeys,FieldValues:String; FieldMatch:TMatch):THTMLForm; overload;
function FindWithField(FormKeys,FormValues:array of String; FormMatch:TMatch;
FieldKeys,FieldValues:array of String; FieldMatch:TMatch):THTMLForm; overload;
end;

TEvalEvent = function(Sender:TObject; Script:string):Variant of object;
TSubmitEvent = procedure(Sender:TObject; Return:Integer) of object;
TChangedEvent = procedure(Sender:TObject; Obj:THTMLElement; Operation:TOperation) of object;

THTMLDocument = class
private
FHTTP:THTTP;
FURL:String;
FTitle:String;
FSource:String;
FDisplay:String;
FRoot:THTMLElementCollection;
FAll:THTMLElementCollection;
FForms:THTMLFormCollection;
FFields:THTMLFieldCollection;
FAnchors:THTMLAnchorCollection;
FImages:THTMLImageCollection;
FStyles:THTMLStyleCollection;
FScripts:THTMLScriptCollection;
FFinds:THTMLElementCollection;
FOnEval: TEvalEvent;
FOnSubmit:TSubmitEvent;
FOnChanged:TChangedEvent;

function GetMetaRefresh: string;
public
constructor Create(AHTTP:THTTP);
destructor Destroy; override;
procedure Clear;
procedure Notify(Element: THTMLElement; Operation: TOperation); virtual;

function Find(Keys,Values:array of String):THTMLElement; overload;
function Find(Keys,Values:array of String; Match:TMatch):THTMLElement; overload;
function Find(Keys,Values:array of String; FindIn:TFindin):THTMLElement; overload;
function Find(Keys,Values:array of String; Parent:THTMLElement):THTMLElement; overload;
function Find(Keys,Values:array of String; Match:TMatch; FindIn:TFindin):THTMLElement; overload;
function Find(Keys,Values:array of String; Match:TMatch; Parent:THTMLElement):THTMLElement; overload;
function Find(Keys,Values:array of String; Match:TMatch; FindIn:TFindin; Parent:THTMLElement):THTMLElement; overload;
function Finds(Keys,Values:array of String):THTMLElementCollection; overload;
function Finds(Keys,Values:array of String; Match:TMatch):THTMLElementCollection; overload;
function Finds(Keys,Values:array of String; FindIn:TFindin):THTMLElementCollection; overload;
function Finds(Keys,Values:array of String; Parent:THTMLElement):THTMLElementCollection; overload;
function Finds(Keys,Values:array of String; Match:TMatch; FindIn:TFindin):THTMLElementCollection; overload;
function Finds(Keys,Values:array of String; Match:TMatch; Parent:THTMLElement):THTMLElementCollection; overload;
function Finds(Keys,Values:array of String; Match:TMatch; FindIn:TFindin; Parent:THTMLElement):THTMLElementCollection; overload;

property URL:String read FURL;
property Title:String read FTitle;
property Source:String read FSource;
property Display:String read FDisplay;
property All:THTMLElementCollection read FAll;
property Root:THTMLElementCollection read FRoot;
property Forms:THTMLFormCollection read FForms;
property Fields:THTMLFieldCollection read FFields;
property Anchors:THTMLAnchorCollection read FAnchors;
property Images:THTMLImageCollection read FImages;
property Styles:THTMLStyleCollection read FStyles;
property Scripts:THTMLScriptCollection read FScripts;
property MetaRefresh:string read GetMetaRefresh;
property OnEval:TEvalEvent read FOnEval write FOnEval;
property OnSubmit:TSubmitEvent read FOnSubmit write FOnSubmit;
property OnChanged:TChangedEvent read FOnChanged write FOnChanged;
end;

THTMLToken = class
private
FIndex: Integer;
FTokenList: TList;
function GetEOF:Boolean;
function GetCurrToken:String;
function GetNextToken:string;
function GetNextNextToken:string;
function GetPrevToken:String;
function GetToken(Index:Integer):String;
public
constructor Create;
destructor Destroy; override;

procedure Clear;
procedure Process(Source:String);

function IsSpace(Token:String):Boolean;
function MoveNext:Boolean;
function SkipSpace:Boolean;
function SkipToken(Token:String):Boolean;
function Preview(Tokens:array of String):Boolean;
function MatchToken(Token:String):Boolean; overload;
function MatchToken(Tokens:array of String):Boolean; overload;

property EOF:Boolean read GetEOF;
property CurrToken:String read GetCurrToken;
property NextToken:String read GetNextToken;
property NextNextToken:String read GetNextNextToken;
property PrevToken:String read GetPrevToken;
property Items[Index:Integer]: String read GetToken; default;
end;

TParseEvent = procedure(Sender:TObject) of object;

THTMLParser = class
type
TransferType = (ttNone,ttHTML,ttScript,ttStyle);
BlockType = (btNone,btRegExp,btString,btComments);
private
FOwner:THTMLParser;
FDocument:THTMLDocument;
FLastContent:String;
FBlock: BlockType;
FEndBlock: TList;
FTransferType:TransferType;
FTokenList:THTMLToken;
FStack:TStack;

procedure CheckBlock;
function GetValue:String;
function GetTransfer:String;
function GetInner(TagName:String):String;
function CheckName(Name:String):Boolean;

procedure Pop(T:TClass);
function Peek:THTMLElement;
procedure Push(Element:THTMLElement);
procedure AddContent(Text:String);
procedure Addobject(Element:THTMLElement);
procedure UpdateParent(Text:String; Element:THTMLElement);
function LabelInfo(TagName:String; var LabelClass:THTMLClass):TLabelEnum;

procedure Process(ASource:String);
procedure ProcessToken;
procedure ProcessLabel;
procedure ProcessCDATA;
procedure ProcessContent;
procedure ProcessComments;
procedure ProcessLabelOpen(Name:String);
procedure ProcessLabelClose(Name:String);
function ProcessLabelValue(Element:THTMLElement):Boolean;
public
constructor Create(ADocument:THTMLDocument; const AOwner:THTMLParser=nil);
destructor Destroy; override;
procedure Clear;
end;

THTMLEngine = class(THTMLDocument)
private
FCharset:String;
FParser:THTMLParser;
FRawCode:RawByteString;
FMetaRefreshEnable:Boolean;
FOnBeforeParse:TParseEvent;
FOnAfterParse:TParseEvent;

function GetCharset:String; overload;
procedure SetCharset(const Value: String);
public
constructor Create(AHTTP:THTTP);
destructor Destroy; override;

procedure Clear;
procedure Load(AUrl:string; ARawCode:RawByteString);
procedure InsertSource(AddCode:String);

function Translate(const Text:string):String;
function GetCharset(const RawCode:RawByteString):String; overload;
function Decoding(const RawCode:RawByteString):String; overload;
function Decoding(const RawCode:RawByteString; ACharSet:String):String; overload;
property Charset:String read FCharset write SetCharset;
property MetaRefreshEnable:Boolean read FMetaRefreshEnable write FMetaRefreshEnable;
property OnBeforeParse:TParseEvent read FOnBeforeParse write FOnBeforeParse;
property OnAfterParse:TParseEvent read FOnAfterParse write FOnAfterParse;
end;


Javascript模拟部分

type
TWebbotSupport = set of (wsHTTPS,wsHTML,wsScript);

TWebbotManage = class;
TWebbot = class
private
FHTTP:THTTP;
FDocument:THTMLEngine;
FCookies:TCookies;
FFreeCookies:Boolean;
FScript:TScriptEngine;
FOwner:TWebbotManage;
FSupport:TWebbotSupport;
procedure OnSubmit(Sender:TObject; Return:Integer);
public
constructor Create(const ASupport:TWebbotSupport=[wsHTTPS,wsHTML,wsScript]; const ACookies:TCookies=nil; const AOwner:TWebbotManage=nil);
destructor Destroy; override;
function Clone:TWebbot;
function Get(Url:string):Integer;
function Post(Url:string; PostData:AnsiString):Integer; overload;
function Post(Url:string; PostField:array of String):Integer; overload;

property HTTP:THTTP read FHTTP;
property Document:THTMLEngine read FDocument;
property Cookies:TCookies read FCookies;
property Script:TScriptEngine read FScript;
end;

TWebbotManage = class(THTTPManage)
private
FWebbotList:TList<TWebbot>;
FSupport:TWebbotSupport;
public
constructor Create(const ASupport:TWebbotSupport=[wsHTTPS,wsHTML,wsScript]); overload;
destructor Destroy; override;

procedure Notify(AObject: Tobject; Operation: TOperation); override;
function CreateWebbot(const Cookies:TCookies=nil):TWebbot;
procedure FreeWebbot(Browser:TWebbot);
end;
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: 
相关文章推荐