Python爬虫学习笔记——Python基础
2018-11-01 21:52
344 查看
Python爬虫学习笔记——Python基础
1 IO编程
1.1 文件读写
Python内置了读写文件的函数,语法为:
open(name[, mode[, buffering]])
# Basic file I/O walkthrough: open a file, read it whole, read it line by
# line, then overwrite it. The Windows path is the sample path of these notes.

# Open the file (the default mode is 'r'). A bare open() has to be paired
# with close(), otherwise the handle stays open.
f = open(r'C:\text\myTextFile.txt')
f.close()

# Read the whole file; the with-statement closes it even if read() raises.
with open(r'C:\text\myTextFile.txt', 'r') as fileReader:
    print(fileReader.read())

# Read one line at a time. The file is reopened because the previous handle
# was already consumed by read() and closed by its with-block.
with open(r'C:\text\myTextFile.txt', 'r') as fileReader:
    for line in fileReader:
        print(line.strip())

# Write to the file ('w' truncates any existing content).
with open(r'C:\text\myTextFile.txt', 'w') as fileReader:
    fileReader.write('myTextFile')
1.2 操作文件和目录
import os
import shutil

# Quick reference of common file and directory operations.
# NOTE: filepath, old, new, file and filename are placeholders carried over
# from the notes; they are not defined in this snippet. Substitute real paths
# before running, and be aware that several calls delete or move data.

# Get the current working directory of the running Python script
os.getcwd()
# List all file and directory names inside the given directory
os.listdir('.')
# Delete a file
os.remove(filepath)
# Delete a chain of empty directories
os.removedirs(r'D:\python')
# Check whether the given path is a file
os.path.isfile(filepath)
# Check whether the given path is a directory
os.path.isdir(filepath)
# Check whether the path is absolute
os.path.isabs(filepath)
# Check whether the path exists
os.path.exists(r'D:\python')
# Split a path into its directory part and file name
os.path.split(filepath)
# Split off the file extension
os.path.splitext(filepath)
# Get the directory part of a path
os.path.dirname(filepath)
# Get the file name part of a path
os.path.basename(filepath)
# Read and set environment variables
os.getenv('PATH')
os.putenv('key', 'value')
# Line terminator used by the current platform
os.linesep  # '\r\n' in Windows, '\n' in Linux
# Name of the platform in use
os.name
# Rename a file or directory
os.rename(old, new)
# Create a directory together with any missing parents
os.makedirs(r'C:\python\test')
# Create a single directory
os.mkdir('test')
# Get file attributes
os.stat(file)
# Change file permissions (a mode is required; timestamps are changed with
# os.utime, not os.chmod)
os.chmod(file, 0o644)
# Get the file size
os.path.getsize(filename)
# Copy a directory tree
shutil.copytree('olddir', 'newdir')
# Copy a file
shutil.copyfile('oldfile', 'newfile')
# Move a file or directory
shutil.move('oldpos', 'newpos')
# Delete a directory
os.rmdir('dir')  # only removes an empty directory
shutil.rmtree('dir')  # removes the directory together with its contents
1.3 序列化操作(把内存的变量变成可存储或传输的过程)
# cPickle only exists on Python 2; Python 3 folds the C implementation into
# the standard pickle module, so fall back to it when the import fails.
try:
    import cPickle as pickle
except ImportError:
    import pickle

d = dict(url='index.html', title='home page', content='home page')

# Serialize to an in-memory byte string (result discarded; shown for reference).
pickle.dumps(d)

# Serialize to a file; the with-statement closes it even if dump() raises.
with open(r'D:\dump.txt', 'wb') as f:
    pickle.dump(d, f)

# Deserialize.
# NOTE(security): unpickling can execute arbitrary code; only load files
# this program wrote itself or that come from a trusted source.
with open(r'D:\dump.txt', 'rb') as f:
    d = pickle.load(f)

# REPL-style echo kept from the notes; it has no effect when run as a script.
d
2 进程和线程
进程是程序在计算机上的执行活动,当运行一个程序时,就启动一个进程。在Windows系统中,进程被细化为线程,作为可以独立运行的单位。多进程,也就是说同一个系统中允许多个进程处于运行状态,也称为多任务。在单CPU里实现多进程,需要使用并发技术。
2.1 多进程
使用os模块的fork方法实现多进程,fork方法调用一次,返回两次,操作系统将当前父进程复制出一份子进程,父进程返回子进程的ID,而子进程永远返回0。
import os

# Demonstrates os.fork(): one call returns twice, with the child's pid in the
# parent and 0 in the child. os.fork() is only available on Unix.
if __name__ == '__main__':
    print('current process %s start ... ' % (os.getpid()))
    try:
        pid = os.fork()
    except OSError:
        # os.fork() reports failure by raising OSError; it never returns a
        # negative pid, so a "pid < 0" check would be unreachable.
        print('error in fork')
    else:
        if pid == 0:
            print('I am child process %s and my parent process is %s' % (os.getpid(), os.getppid()))
        else:
            print('I %s created a child process %s.' % (os.getpid(), pid))
使用multiprocessing模块实现多进程
import os
from multiprocessing import Process


def run_proc(name):
    """Print the given task name together with the pid of the running process.

    Used as the target of each child Process below.
    """
    print('child process %s (%s) running...' % (name, os.getpid()))


if __name__ == '__main__':
    print('parent process %s' % os.getpid())
    children = []
    for i in range(5):
        p = Process(target=run_proc, args=(str(i), ))
        print('Process will start.')
        p.start()
        children.append(p)
    # Wait for every child, not only the last one started.
    for p in children:
        p.join()
    print('Process end.')
使用multiprocessing模块的Pool类来创建多进程
from multiprocessing import Pool
import os
import random
import time


def run_task(name):
    """Report the start of a task, sleep for a random time below 3 seconds,
    then report that the task ended."""
    worker_pid = os.getpid()
    print('Task %s (pid=%s) is running...' % (name, worker_pid))
    delay = random.random() * 3
    time.sleep(delay)
    print('Task %s end.' % name)


if __name__ == '__main__':
    print('current process %s' % os.getpid())
    # At most three worker processes run concurrently; five tasks are queued.
    p = Pool(processes=3)
    for task_id in range(5):
        p.apply_async(run_task, args=(task_id, ))
    print('waiting for all subprocesses done...')
    # close() stops new submissions; it has to be called before join().
    p.close()
    p.join()
    print('all subprocesses done.')
进程间通信,使用Queue方式完成进程间通信。
from multiprocessing import Process, Queue
import os
import random
import time


def proc_write(q, urls):
    """Writer process body: put every url on the queue, pausing a random
    fraction of a second after each one."""
    print('Process(%s) is writing...' % os.getpid())
    for url in urls:
        q.put(url)
        print('Put %s to queue...' % url)
        time.sleep(random.random())


def proc_read(q):
    """Reader process body: print every url taken from the queue.

    Returns when it receives None, which the parent puts on the queue once
    all writers have finished.
    """
    print('Process(%s) is reading...' % os.getpid())
    while True:
        url = q.get(True)
        if url is None:
            break
        print('Get %s from queue.' % url)


if __name__ == '__main__':
    # The parent creates the Queue and hands it to every child process.
    q = Queue()
    proc_writer1 = Process(target=proc_write, args=(q, ['url_1', 'url_2', 'url_3']))
    proc_writer2 = Process(target=proc_write, args=(q, ['url_4', 'url_5', 'url_6']))
    proc_reader = Process(target=proc_read, args=(q, ))
    # Start the writer processes.
    proc_writer1.start()
    proc_writer2.start()
    # Start the reader process.
    proc_reader.start()
    # Wait for the writers to finish.
    proc_writer1.join()
    proc_writer2.join()
    # Ask the reader to stop after it has drained everything queued before
    # this point; terminate() could kill it with items still unread.
    q.put(None)
    proc_reader.join()
使用Pipe方式完成进程间通信
import multiprocessing
import os
import random
import time


def proc_send(pipe, urls):
    """Send every url through the pipe end, pausing a random fraction of a
    second after each one, then send None to mark the end of the stream."""
    for url in urls:
        print('Process(%s) send: %s' % (os.getpid(), url))
        pipe.send(url)
        time.sleep(random.random())
    # End-of-stream marker so the receiver can stop instead of blocking forever.
    pipe.send(None)


def proc_recv(pipe):
    """Print every url received from the pipe end; return once None arrives."""
    while True:
        url = pipe.recv()
        if url is None:
            break
        print('Process(%s) rev:%s' % (os.getpid(), url))
        time.sleep(random.random())


if __name__ == '__main__':
    # Pipe() returns a pair of connection objects, one for each end.
    pipe = multiprocessing.Pipe()
    p1 = multiprocessing.Process(target=proc_send, args=(pipe[0], ['url_' + str(i) for i in range(10)]))
    p2 = multiprocessing.Process(target=proc_recv, args=(pipe[1], ))
    p1.start()
    p2.start()
    p1.join()
    # Returns because the receiver exits on the end-of-stream marker.
    p2.join()
2.2 多线程
用threading模块创建多线程,第一种方式,把一个函数传入并创建Thread实例,然后调用start方法开始执行。
import random
import threading
import time


def thread_run(urls):
    """Body of a worker thread: print each url, pausing a random fraction of
    a second after each one."""
    me = threading.current_thread().name
    print('Current %s is running...' % me)
    for url in urls:
        print('%s ---->>> %s' % (me, url))
        time.sleep(random.random())
    print('%s ended.' % me)


print('%s is running...' % threading.current_thread().name)
# Build two threads by passing the target function to Thread.
t1 = threading.Thread(
    target=thread_run,
    name='Thread_1',
    args=(['url_1', 'url_2', 'url_3'],),
)
t2 = threading.Thread(
    target=thread_run,
    name='Thread_2',
    args=(['url_4', 'url_5', 'url_6'],),
)
# Start both before joining either so they run concurrently.
t1.start()
t2.start()
t1.join()
t2.join()
print('%s ended.' % threading.current_thread().name)
第二种方式直接从threading.Thread继承并创建线程类,然后重写__init__方法和run方法。
import random
import threading
import time


class myThread(threading.Thread):
    """Thread subclass that prints each of its urls, pausing a random
    fraction of a second after each one."""

    def __init__(self, name, urls):
        super(myThread, self).__init__(name=name)
        self.urls = urls

    def run(self):
        # Use the name of whichever thread executes run(), as the prints
        # are meant to show which thread is doing the work.
        me = threading.current_thread().name
        print('Current %s is running...' % me)
        for url in self.urls:
            print('%s ---->>> %s' % (me, url))
            time.sleep(random.random())
        print('%s ended.' % me)


print('%s is running...' % threading.current_thread().name)
t1 = myThread(name='Thread_1', urls=['url_1', 'url_2', 'url_3'])
t2 = myThread(name='Thread_2', urls=['url_4', 'url_5', 'url_6'])
# Start both before joining either so they run concurrently.
t1.start()
t2.start()
t1.join()
t2.join()
print('%s ended.' % threading.current_thread().name)
线程同步
import threading

# Lock guarding the shared counter below.
mylock = threading.RLock()
# Counter shared by all threads; each thread increments it until it reaches 4.
num = 0


class myThread(threading.Thread):
    """Thread that increments the shared counter ``num`` under ``mylock``
    until the counter reaches 4."""

    def __init__(self, name):
        threading.Thread.__init__(self, name=name)

    def run(self):
        global num
        while True:
            # The with-statement releases the lock on every exit path,
            # including the break below and any exception raised by print.
            with mylock:
                print('%s locked, Number: %d' % (threading.current_thread().name, num))
                if num >= 4:
                    print('%s released, Number: %d' % (threading.current_thread().name, num))
                    break
                num += 1
                print('%s released, Number: %d' % (threading.current_thread().name, num))


if __name__ == '__main__':
    thread1 = myThread('Thread_1')
    thread2 = myThread('Thread_2')
    thread1.start()
    thread2.start()
2.3 协程
协程,是一种轻量级线程,gevent是一个基于协程的Python网络函数库,比较完善地提供了协程的支持。
# Patch the standard library first so the imports below are cooperative.
from gevent import monkey
monkey.patch_all()

import contextlib

import gevent

# urllib2 only exists on Python 2; urllib.request provides urlopen on Python 3.
try:
    import urllib2
except ImportError:
    import urllib.request as urllib2


def run_task(url):
    """Fetch url and print how many bytes were received.

    Any error is printed instead of raised so one failing url does not
    affect the other greenlets.
    """
    print('Visit --> %s' % url)
    try:
        # closing() releases the connection even if read() raises.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            data = response.read()
        print('%d bytes received from %s.' % (len(data), url))
    except Exception as e:
        print(e)


if __name__ == '__main__':
    urls = ['https://github.com/', 'https://www.python.org/', 'https://www.cnblogs.com/']
    # One greenlet per url; joinall waits until all of them have finished.
    greenlets = [gevent.spawn(run_task, url) for url in urls]
    gevent.joinall(greenlets)
使用gevent中的pool对象,对动态数量的greenlet进行并发管理
# Patch the standard library first so the imports below are cooperative.
from gevent import monkey
monkey.patch_all()

import contextlib

from gevent.pool import Pool

# urllib2 only exists on Python 2; urllib.request provides urlopen on Python 3.
try:
    import urllib2
except ImportError:
    import urllib.request as urllib2


def run_task(url):
    """Fetch url, print how many bytes were received and return a status string.

    Any error is printed instead of raised; the status string is returned
    either way.
    """
    print('Visit --> %s' % url)
    try:
        # closing() releases the connection even if read() raises.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            data = response.read()
        print('%d bytes received from %s.' % (len(data), url))
    except Exception as e:
        print(e)
    return 'url:%s --->finish' % url


if __name__ == '__main__':
    # At most two greenlets run at the same time.
    pool = Pool(2)
    urls = ['https://github.com/', 'https://www.python.org/', 'http://www.cnblogs.com/']
    results = pool.map(run_task, urls)
    print(results)
2.4 分布式进程
分布式进程指的是将Process进程分布到多台机器上。
首先创建服务进程:
# The queue module is named Queue on Python 2 and queue on Python 3.
try:
    import Queue
except ImportError:
    import queue as Queue

from multiprocessing import freeze_support
from multiprocessing.managers import BaseManager

# Task number and define receive/send queue
task_number = 10
task_queue = Queue.Queue(task_number)
result_queue = Queue.Queue(task_number)


def get_task():
    """Return the task queue that the manager exposes as get_task_queue."""
    return task_queue


def get_result():
    """Return the result queue that the manager exposes as get_result_queue."""
    return result_queue


# Create QueueManager class
class QueueManager(BaseManager):
    """Manager subclass on which the two queues are registered."""
    pass


def win_run():
    """Start the manager server, publish ten tasks and print ten results.

    The manager is always shut down, whether or not an error occurs.
    """
    QueueManager.register('get_task_queue', callable=get_task)
    QueueManager.register('get_result_queue', callable=get_result)
    # Bind port and setup validation token. The key is a byte string because
    # Python 3 rejects a text authkey.
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'enterprise')
    # Initiate
    manager.start()
    try:
        # Access task queue and result queue through network
        task = manager.get_task_queue()
        result = manager.get_result_queue()
        # Add task
        for url in ['ImageUrl_' + str(i) for i in range(10)]:
            print('Put task %s ...' % url)
            task.put(url)
        print('try get result')
        for i in range(10):
            print('result is %s ' % result.get(timeout=10))
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still propagates
        # once the manager has been shut down.
        print('Manager error')
    finally:
        # Must close, or there will be an error
        manager.shutdown()


if __name__ == '__main__':
    freeze_support()
    win_run()
其次,创建任务进程:
import time
from multiprocessing.managers import BaseManager

# The queue module is named Queue on Python 2 and queue on Python 3.
try:
    from Queue import Empty
except ImportError:
    from queue import Empty


# Create QueueManager
class QueueManager(BaseManager):
    """Manager subclass used to reach the queues published by the server."""
    pass


# Use QueueManager to register the names only; the server supplies the queues.
QueueManager.register('get_task_queue')
QueueManager.register('get_result_queue')

# Connect to the server
server_addr = '127.0.0.1'
print('Connect to server %s...' % server_addr)
# Port and validation token should be the same as the taskManager. The key is
# a byte string because Python 3 rejects a text authkey.
m = QueueManager(address=(server_addr, 8001), authkey=b'enterprise')
m.connect()

# Access Queue object
task = m.get_task_queue()
result = m.get_result_queue()

# Access task from the queue and write into the result queue
while not task.empty():
    try:
        image_url = task.get(True, timeout=5)
    except Empty:
        # The queue can be emptied between the empty() check and get(),
        # e.g. by another worker; treat that as "no more work".
        break
    print('run task download %s...' % image_url)
    time.sleep(1)
    result.put('%s--->success' % image_url)

print('worker exit.')
3 网络编程
Socket是网络编程的一个抽象概念,Python提供了两个基本Socket模块,分别是socket和SocketServer。
3.1 TCP编程
TCP是一种面向连接的通信方式。下面演示创建TCP服务端:
import socket
import threading
import time


def dealClient(sock, addr):
    """Serve one TCP client: greet it, then echo each message back.

    The loop ends when the peer closes the connection or sends 'exit'.
    The socket is closed on every exit path, including exceptions.

    sock: connected socket returned by accept().
    addr: (host, port) pair of the peer, used only for the log messages.
    """
    # Receive data and send out
    print('Accept new connection from %s:%s...' % addr)
    try:
        sock.send(b'Hello, I am server')
        while True:
            data = sock.recv(1024)
            time.sleep(1)
            if not data:
                break
            # Decode once and reuse the text for the check, log and reply.
            text = data.decode('utf-8')
            if text == 'exit':
                break
            print('--->>%s!' % text)
            sock.send(('Loop_Msg: %s!' % text).encode('utf-8'))
    finally:
        # Close Socket
        sock.close()
    print('Connection from %s:%s closed.' % addr)


if __name__ == '__main__':
    # Create a Socket based on IPv4 and TCP protocol
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Socket binds IP(127.0.0.1) and port
        s.bind(('127.0.0.1', 9999))
        # Listen
        s.listen(5)
        print('Waiting for connection...')
        while True:
            # Receive a new connection
            sock, addr = s.accept()
            # Create a new thread to deal with TCP connection
            t = threading.Thread(target=dealClient, args=(sock, addr))
            t.start()
    finally:
        # Release the listening socket when the loop is interrupted.
        s.close()
TCP客户端:
import socket

# Initialize Socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    # Connect to the target IP and port
    s.connect(('127.0.0.1', 9999))
    # Receive message
    print('--->>' + s.recv(1024).decode('utf-8'))
    # Send message
    s.send(b'Hello, I am a client')
    print('--->>' + s.recv(1024).decode('utf-8'))
    # Tell the server to end the session
    s.send(b'exit')
finally:
    # Close Socket, also when connecting or receiving fails
    s.close()
3.2 UDP编程
UDP是面向无连接的协议,只需要知道对方的IP地址和端口号,就可以直接发数据报。先创建服务端:
import socket

# UDP echo-style server: answers every datagram with a greeting that embeds
# the received payload.
listen_address = ('127.0.0.1', 9999)

s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.bind(listen_address)
print('Bind UDP on 9999...')

while True:
    # recvfrom also yields the sender address, which is where the reply goes.
    payload, peer = s.recvfrom(1024)
    print('Received from %s:%s.' % peer)
    reply = b'Hello, %s!' % payload
    s.sendto(reply, peer)
再创建客户端
import socket

# UDP client: sends two datagrams to the local server and prints each reply.
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
    for data in [b'Hello', b'World']:
        s.sendto(data, ('127.0.0.1', 9999))
        # No timeout is set, so this waits until a reply arrives.
        print(s.recv(1024).decode('utf-8'))
finally:
    # Close the socket even if sending or receiving fails.
    s.close()
相关文章推荐
- Python学习笔记(八)爬虫基础(正则和编解码)
- 【python学习笔记】7:用python实现爬虫-基础
- python学习笔记(3)--爬虫基础教程1
- Python爬虫学习笔记之基础知识
- Python基础学习笔记
- Python学习笔记 02 Python基础
- python基础教程_学习笔记1:序列-1
- Python学习笔记---基础汇总部分
- python 学习笔记(-) 基础开端
- python基础教程_学习笔记5:字符串
- python基础教程_学习笔记18:标准库:一些最爱——shelve
- python基础学习笔记(二)
- Python学习笔记(十五):类基础
- python基础教程_学习笔记8:序列_练习与总结_1
- python基础教程_学习笔记15:标准库:一些最爱——fileinput
- python基础教程_学习笔记21:文件和素材
- Python核心编程(第二版)学习笔记02 【第3章 Python基础】
- Python学习笔记整理(十四)类基础
- Python基础教程学习笔记----第四章 字典
- python基础学习笔记(四)