您的位置:首页 > 编程语言 > Python开发

python使用urllib模块和pyquery实现阿里巴巴排名查询

2014-01-16 14:45 826 查看
class ProxyScrapy(object):
    """Fetch page bodies over HTTP with a shared cookie jar and optional proxy.

    When the module-level ``PROXY_ENABLE`` flag is true, each request attempt
    goes through a proxy obtained from ``ProxyRobot.get_random_proxy()`` and
    the outcome of the attempt is reported back to the robot.
    """

    def __init__(self):
        self.proxy_robot = ProxyRobot()
        # Proxy used by the most recent attempt; stays None until a proxy
        # has been picked (i.e. always None when PROXY_ENABLE is false).
        self.current_proxy = None
        # One cookie jar shared by every opener this instance builds.
        self.cookie = cookielib.CookieJar()

    def __builder_proxy_cookie_opener(self):
        """Build, globally install and return a urllib2 opener.

        The opener always carries this instance's cookie jar.  With
        ``PROXY_ENABLE`` set, a fresh proxy is requested from the proxy robot,
        remembered in ``self.current_proxy`` and used for ``http`` traffic.
        """
        handlers = [urllib2.HTTPCookieProcessor(self.cookie)]

        if PROXY_ENABLE:
            self.current_proxy = ip_port = self.proxy_robot.get_random_proxy()
            # The first 7 characters are dropped before handing the address to
            # ProxyHandler -- presumably a leading "http://" scheme prefix.
            # TODO confirm against the format ProxyRobot returns.
            handlers.append(urllib2.ProxyHandler({'http': ip_port[7:]}))

        opener = urllib2.build_opener(*handlers)
        # NOTE: install_opener changes the process-wide default opener used by
        # urllib2.urlopen(); kept because other code may rely on it.
        urllib2.install_opener(opener)
        return opener

    def get_html_body(self, url, max_retries=None):
        """Return the body of ``url``, retrying until a 200 response arrives.

        Every attempt builds a new opener (and therefore picks a new proxy
        when ``PROXY_ENABLE`` is set) and uses a 2 second timeout.  A failed
        attempt -- an exception, or a status code other than 200 -- is
        reported through ``proxy_robot.handle_double_proxy``; a successful
        one through ``proxy_robot.handle_success_proxy``.

        Args:
            url: Address to fetch.
            max_retries: Number of additional attempts allowed after the
                first one.  ``None`` (the default) retries without limit.

        Returns:
            The raw response body as returned by ``response.read()``.

        Raises:
            IOError: ``max_retries`` was given and every attempt failed.
        """
        attempts = 0
        # A loop instead of self-recursion: a long run of bad proxies must not
        # end in a "maximum recursion depth exceeded" crash.
        while max_retries is None or attempts <= max_retries:
            attempts += 1
            opener = self.__builder_proxy_cookie_opener()
            request = urllib2.Request(url)

            try:
                response = opener.open(request, timeout=2)
                try:
                    if response.getcode() == 200:
                        if PROXY_ENABLE:
                            self.proxy_robot.handle_success_proxy(
                                self.current_proxy)
                        return response.read()
                finally:
                    # Release the socket on success and on failure alike.
                    response.close()
            except Exception as inst:
                # Best-effort scraping: any failure just triggers a retry.
                # Single-argument form prints "<error> <proxy>" on Python 2
                # exactly as the print statement would.
                print("%s %s" % (inst, self.current_proxy))

            # Reached for both a non-200 status and an exception.  Only report
            # when a proxy was actually in use for this attempt.
            if PROXY_ENABLE:
                self.proxy_robot.handle_double_proxy(self.current_proxy)

        raise IOError(
            "giving up on %s after %d attempts" % (url, attempts))

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息