您的位置:首页 > 理论基础 > 计算机网络

[转]利用Winsock实现HTTP的GET请求

2013-11-13 16:25 162 查看
网络爬虫需要从指定的URL通过HTTP协议来获得HTML文件信息,以此从一个URL爬到另一个URL。在Windows平台上,这往往通过WinINet接口实现。

  但是,如果对HTTP协议熟悉的话,也可以通过Winsock接口实现。代码如下。

1 #pragma warning (disable:4996)

2

3 #define DEFAULT_URL "http://www.google.com" // URL used when the user submits an empty line at the prompt

4

5 BOOL WinsockStartup(BYTE highVer, BYTE lowVer)

6 {

7 WSADATA wsaData;

8 return WSAStartup(MAKEWORD(highVer, lowVer), &wsaData) == 0;

9 }

10

11 int SendData(SOCKET s, char * data)

12 {

13 return send(s, data, strlen(data), 0);

14 }

15

/*
 * Copy the host portion of a URL into pszHostName.
 *
 * A leading "http://" is skipped when present. The host is then everything
 * up to (but not including) the first '/', or the remainder of the string
 * when there is no '/'. Anything else in that span (for example ":port") is
 * copied as-is.
 *
 * pszURL       NUL-terminated URL; it is only read.
 * pszHostName  Output buffer. Its size is not passed in, so the caller must
 *              supply at least strlen(pszURL) + 1 bytes.
 *
 * The result is always NUL-terminated, so the caller does not need to
 * pre-zero the output buffer.
 */
void ParseTheURL(char * pszURL, char * pszHostName)
{
    static const char scheme[] = "http://";
    const size_t schemeLen = sizeof scheme - 1;
    const char *hostStart = pszURL;
    const char *hostEnd;
    size_t hostLen;

    if (strncmp(pszURL, scheme, schemeLen) == 0)
    {
        hostStart += schemeLen;
    }

    hostEnd = strchr(hostStart, '/');
    if (hostEnd != NULL)
    {
        hostLen = (size_t)(hostEnd - hostStart);
    }
    else
    {
        hostLen = strlen(hostStart);
    }

    memcpy(pszHostName, hostStart, hostLen);
    pszHostName[hostLen] = '\0';
}

39

/*
 * Program entry point.
 *
 * Prompts for a URL (falling back to DEFAULT_URL on an empty line), resolves
 * the host, connects over TCP port 80, sends an HTTP/1.1 GET request, and
 * prints the response header followed by the body and the body's byte count.
 *
 * Every exit path waits for one more line of input so that a console window
 * stays open long enough to read the output.
 *
 * Returns 0 after a completed exchange, -1 on start-up, resolution,
 * connection or send failure.
 */
int _tmain()
{
    char szURL[256] = { 0 };        // URL entered by the user (or DEFAULT_URL)
    char szHostName[256] = { 0 };   // host name extracted from szURL
    char szPortName[] = "80";       // service name / port for getaddrinfo

    if (!WinsockStartup(2, 2))
    {
        _tcprintf(TEXT("初始化Windows Sockets失败!"));
        cin.getline(szURL, 255);
        return -1;
    }

    cout<<"输入URL:";
    cin.getline(szURL, 255);

    if (strcmp(szURL, "") == 0)
    {
        strcpy(szURL, DEFAULT_URL);
        cout<<DEFAULT_URL<<endl;
    }

    ParseTheURL(szURL, szHostName);

    // Request target: the path part of the URL, or "/" when the URL has none.
    // Sending the raw user input would produce an invalid request line
    // whenever the scheme is omitted (e.g. "www.example.com/page").
    const char * pszPath = szURL;
    if (strncmp(pszPath, "http://", 7) == 0)
    {
        pszPath += 7;
    }
    pszPath = strchr(pszPath, '/');
    if (pszPath == NULL)
    {
        pszPath = "/";
    }

    // Restrict name resolution to IPv4 TCP endpoints.
    addrinfo aiHints = { 0 };
    addrinfo * aiList = NULL;

    aiHints.ai_family = AF_INET;
    aiHints.ai_socktype = SOCK_STREAM;
    aiHints.ai_protocol = IPPROTO_TCP;

    int resolveError = getaddrinfo(szHostName, szPortName, &aiHints, &aiList);
    if (resolveError != 0)
    {
        _tcprintf_s(TEXT("getaddrinfo失败:%d"), resolveError);
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Try each resolved address in turn until one connects.
    SOCKET s = INVALID_SOCKET;
    for (addrinfo * aiPtr = aiList; aiPtr != NULL; aiPtr = aiPtr->ai_next)
    {
        s = socket(aiPtr->ai_family, aiPtr->ai_socktype, aiPtr->ai_protocol);
        if (s == INVALID_SOCKET)
        {
            _tcprintf_s(TEXT("socket创建失败:%d"), WSAGetLastError());
            continue;
        }

        if (connect(s, aiPtr->ai_addr, (int)aiPtr->ai_addrlen) == SOCKET_ERROR)
        {
            // Capture the error before closesocket() can overwrite it.
            int connectError = WSAGetLastError();
            closesocket(s);
            s = INVALID_SOCKET;
            _tcprintf_s(TEXT("connect失败:%d"), connectError);
            continue;
        }
        break;
    }

    freeaddrinfo(aiList);

    if (s == INVALID_SOCKET)
    {
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Build the whole request once. The header section ends with exactly one
    // empty line. Path and host are each at most 255 characters, so the
    // formatted request always fits in the buffer.
    char requestData[1024] = { 0 };
    int requestLen = sprintf_s(requestData, sizeof requestData,
        "GET %s HTTP/1.1\r\n"
        "Host:%s\r\n"
        "Accept: */*\r\n"
        "User-Agent: Mozilla/4.0(compatible; MSIE 5.00; Windows NT)\r\n"
        "Connection:Close\r\n"
        "\r\n",
        pszPath, szHostName);

    if (requestLen < 0 || SendData(s, requestData) < 0)
    {
        _tcprintf_s(TEXT("send失败:%d"), WSAGetLastError());
        closesocket(s);
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    char buffer[1024] = { 0 };
    int l = 0;
    int chars = 0;      // characters seen on the current header line, CR/LF excluded
    BOOL done = FALSE;

    // Print the HTTP response header one byte at a time, stopping after the
    // first empty line.
    while (!done)
    {
        l = recv(s, buffer, 1, 0);
        if (l <= 0)
        {
            // Connection closed or failed: nothing new in buffer to print.
            break;
        }

        switch (*buffer)
        {
        case '\r':
            break;
        case '\n':
            if (chars == 0)
            {
                done = TRUE;    // an empty line terminates the header
            }
            chars = 0;          // start counting a new line
            break;
        default:
            ++chars;
            break;
        }
        printf("%c", *buffer);
    }

    // Receive and print the body until the peer closes the connection.
    int sum = 0;
    while ((l = recv(s, buffer, sizeof (buffer) - 1, 0)) > 0)
    {
        sum += l;
        buffer[l] = 0;
        // Never use received data as the format string itself.
        printf("%s", buffer);
    }

    // Print the body size. The original author observed that it matches the
    // Content-Length response header, which can be used to check that the
    // body was received completely (this only applies to responses that
    // carry a Content-Length header).
    printf("\n\n大小 = %d字节\n",sum);

    closesocket(s);
    WSACleanup();

    cin.getline(szURL, 255);
    return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: