您的位置:首页 > 编程语言 > Python开发

Python爬虫学习笔记——Python基础

2018-11-01 21:52 344 查看

Python爬虫学习笔记——Python基础

1 IO编程

1.1 文件读写

Python内置了读写文件的函数,语法为:
open(name[, mode[, buffering]])

# Open the file. Without `with`, the handle must be closed explicitly.
f = open(r'C:\text\myTextFile.txt')
f.close()
# Read the whole file in one call.
with open(r'C:\text\myTextFile.txt', 'r') as fileReader:
    print(fileReader.read())
# Read the file line by line. The file is reopened because read() above
# already consumed it; calling readlines() on the same handle yields nothing.
with open(r'C:\text\myTextFile.txt', 'r') as fileReader:
    for line in fileReader.readlines():
        print(line.strip())
# Write to the file ('w' truncates any existing content).
with open(r'C:\text\myTextFile.txt', 'w') as fileWriter:
    fileWriter.write('myTextFile')

1.2 操作文件和目录

import os
import shutil
# Quick reference of common os / shutil calls. Names such as filepath, old,
# new, file and filename are placeholders to be supplied by the reader.

# Get the current working directory of the running Python script
os.getcwd()
# List the names of all files and directories inside the given directory
os.listdir(r'D:\python')
# Delete a file
os.remove(filepath)
# Remove the leaf directory, then each parent directory that is left empty
os.removedirs(r'D:\python')
# Check whether the given path is a file
os.path.isfile(filepath)
# Check whether the given path is a directory
os.path.isdir(filepath)
# Check whether the given path is absolute
os.path.isabs(filepath)
# Check whether the path exists
os.path.exists(r'D:\python')
# Split a path into (directory name, file name)
os.path.split(filepath)
# Split a path into (root, extension)
os.path.splitext(filepath)
# Get the directory part of a path
os.path.dirname(filepath)
# Get the file-name part of a path
os.path.basename(filepath)
# Read and set environment variables.
# NOTE: os.putenv() does not update os.environ; assigning to
# os.environ['MY_VAR'] is usually preferable.
os.getenv('PATH')
os.putenv('MY_VAR', 'value')
# Line terminator used by the current platform
os.linesep  # '\r\n' in Windows, '\n' in Linux
# Name of the platform in use
os.name
# Rename a file or directory
os.rename(old, new)
# Create a directory together with any missing parent directories
os.makedirs(r'C:\python\test')
# Create a single directory
os.mkdir('test')
# Get file attributes
os.stat(file)
# Change a file's permission bits (use os.utime to change its timestamps)
os.chmod(file, 0o644)
# Get the file size in bytes
os.path.getsize(filename)
# Copy a directory tree
shutil.copytree('olddir', 'newdir')
# Copy a file
shutil.copyfile('oldfile', 'newfile')
# Move a file
shutil.move('oldpos', 'newpos')
# Delete a directory
os.rmdir('dir')  # only removes an empty directory
shutil.rmtree('dir')  # removes a directory together with all its contents

1.3 序列化操作(把内存的变量变成可存储或传输的过程)

import cPickle as pickle
d = dict(url='index.html', title='home page', content='home page')
# dumps() returns the serialized form as a string instead of writing it out.
serialized = pickle.dumps(d)
# dump() writes the serialized object to a file opened in binary mode;
# `with` closes the file even if pickling fails.
with open(r'D:\dump.txt', 'wb') as f:
    pickle.dump(d, f)

# Deserialize the object back from the file.
# NOTE: only unpickle data from a trusted source; loading a pickle can
# execute arbitrary code.
with open(r'D:\dump.txt', 'rb') as f:
    d = pickle.load(f)
print(d)

2 进程和线程

进程是程序在计算机上的执行活动,当运行一个程序时,就启动一个进程。在Windows系统中,进程被细化为线程,作为可以独立运行的单位。多进程,也就是说同一个系统中允许多个进程处于运行状态,也称为多任务。在单CPU里实现多进程,需要使用并发技术。

2.1 多进程

使用os模块的fork方法实现多进程(仅适用于Unix/Linux系统,Windows不支持fork)。fork方法调用一次,返回两次:操作系统将当前父进程复制出一份子进程,在父进程中返回子进程的ID,而在子进程中永远返回0。

import os
if __name__ == '__main__':
    print('current process %s start ... ' % (os.getpid()))
    try:
        # fork() returns twice: the child's pid in the parent, 0 in the child.
        pid = os.fork()
    except OSError:
        # Python reports a failed fork by raising OSError, never by
        # returning a negative pid.
        print('error in fork')
    else:
        if pid == 0:
            print('I am child process %s and my parent process is %s' % (os.getpid(), os.getppid()))
        else:
            print('I %s created a child process %s.' % (os.getpid(), pid))

使用multiprocessing模块实现多进程

import os
from multiprocessing import Process
def run_proc(name):
    """Print the given task name together with the pid of the running process."""
    print('child process %s (%s) running...' % (name, os.getpid()))
if __name__ == '__main__':
    print('parent process %s' % os.getpid())
    children = []
    for i in range(5):
        p = Process(target=run_proc, args=(str(i), ))
        print('Process will start.')
        p.start()
        children.append(p)
    # Join every child, not just the last one created, so that the final
    # message is only printed after all of them have finished.
    for p in children:
        p.join()
    print('Process end.')

使用multiprocessing模块的Pool类来创建多进程

from multiprocessing import Pool
import os, time, random
def run_task(name):
    """Announce the task, sleep for a random time of up to 3 seconds, then report the end."""
    print('Task %s (pid=%s) is running...' % (name, os.getpid()))
    time.sleep(random.random() * 3)
    print('Task %s end.' % name)
if __name__ == '__main__':
    print('current process %s' % os.getpid())
    # At most 3 worker processes run at once; the 5 tasks are queued on them.
    p = Pool(processes=3)
    for i in range(5):
        p.apply_async(run_task, args=(i, ))
    print('waiting for all subprocesses done...')
    # close() must be called before join(): no more tasks may be submitted.
    p.close()
    p.join()
    print('all subprocesses done.')

进程间通信,使用Queue方式完成进程间通信。

from multiprocessing import Process, Queue
import os, time, random

# Code executed by the writer process
def proc_write(q, urls):
    """Put every item of `urls` on queue `q`, pausing a random time (< 1 s) after each."""
    print('Process(%s) is writing...' % os.getpid())
    for url in urls:
        q.put(url)
        print('Put %s to queue...' % url)
        time.sleep(random.random())
# Code executed by the reader process
def proc_read(q):
    """Print every item taken from queue `q` until a None sentinel arrives.

    The sentinel lets the reader finish on its own, so the parent can join()
    it instead of killing it with terminate(), which may lose queued items
    or leave the queue in a corrupted state.
    """
    print('Process(%s) is reading...' % os.getpid())
    while True:
        url = q.get(True)
        if url is None:
            break
        print('Get %s from queue.' % url)


if __name__ == '__main__':
    # The parent creates the Queue and passes it to every child process
    q = Queue()
    proc_writer1 = Process(target=proc_write, args=(q, ['url_1', 'url_2', 'url_3']))
    proc_writer2 = Process(target=proc_write, args=(q, ['url_4', 'url_5', 'url_6']))
    proc_reader = Process(target=proc_read, args=(q, ))
    # Start the writer processes
    proc_writer1.start()
    proc_writer2.start()
    # Start the reader process
    proc_reader.start()
    # Wait for the writers to finish
    proc_writer1.join()
    proc_writer2.join()
    # All data is queued by now: send the sentinel after it and wait for the
    # reader to drain the queue and exit by itself.
    q.put(None)
    proc_reader.join()

使用Pipe方式完成进程间通信

import multiprocessing
import random
import time, os

def proc_send(pipe, urls):
    """Send every item of `urls` through `pipe`, pausing a random time (< 1 s) after each."""
    for url in urls:
        print('Process(%s) send: %s' % (os.getpid(), url))
        pipe.send(url)
        time.sleep(random.random())
def proc_recv(pipe):
    """Print every message received from `pipe` until told to stop.

    The loop ends when a None sentinel is received or when the other end of
    the pipe has been closed (EOFError). Without a stop condition the
    receiver would block forever and could never be joined.
    """
    while True:
        try:
            url = pipe.recv()
        except EOFError:
            break
        if url is None:
            break
        print('Process(%s) rev:%s' % (os.getpid(), url))
        time.sleep(random.random())


if __name__ == '__main__':
    # Pipe() is duplex by default: both connection objects can send and receive.
    pipe = multiprocessing.Pipe()
    p1 = multiprocessing.Process(target=proc_send, args=(pipe[0], ['url_' + str(i) for i in range(10)]))
    p2 = multiprocessing.Process(target=proc_recv, args=(pipe[1], ))
    p1.start()
    p2.start()
    p1.join()
    # The sender is done: queue the sentinel behind its messages so that the
    # receiver exits and p2.join() can return.
    pipe[0].send(None)
    p2.join()

2.2 多线程

用threading模块创建多线程,第一种方式,把一个函数传入并创建Thread实例,然后调用start方法开始执行。

import random
import time, threading
# Code executed by each new thread
def thread_run(urls):
    """Print each item of `urls` prefixed by the current thread's name.

    Sleeps a random time (< 1 s) after each item and prints a start and an
    end message around the loop.
    """
    print('Current %s is running...' % threading.current_thread().name)
    for url in urls:
        print('%s ---->>> %s' % (threading.current_thread().name, url))
        time.sleep(random.random())
    print('%s ended.' % threading.current_thread().name)
# Driver: announce the main thread, run two named worker threads over their
# own URL lists, wait for both, then announce the end of the main thread.
main_thread_name = threading.current_thread().name
print('%s is running...' % main_thread_name)
t1 = threading.Thread(
    name='Thread_1',
    target=thread_run,
    args=(['url_1', 'url_2', 'url_3'],),
)
t2 = threading.Thread(
    name='Thread_2',
    target=thread_run,
    args=(['url_4', 'url_5', 'url_6'],),
)
# Start both workers before joining either, so that they run concurrently.
for worker in (t1, t2):
    worker.start()
for worker in (t1, t2):
    worker.join()
print('%s ended.' % main_thread_name)

第二种方式直接从threading.Thread继承并创建线程类,然后重写__init__方法和run方法。

import random
import threading
import time
class myThread(threading.Thread):
    """Thread that prints each of its URLs, pausing a random time (< 1 s) after each."""

    def __init__(self, name, urls):
        threading.Thread.__init__(self, name=name)
        self.urls = urls

    def run(self):
        """Thread body: print a start message, every URL, then an end message."""
        print('Current %s is running...' % threading.current_thread().name)
        for url in self.urls:
            print('%s ---->>> %s' % ((threading.current_thread().name), url))
            time.sleep(random.random())
        print('%s ended.' % threading.current_thread().name)
# Driver: announce the main thread, run two myThread workers over their own
# URL lists, wait for both, then announce the end of the main thread.
main_thread_name = threading.current_thread().name
print('%s is running...' % main_thread_name)
t1 = myThread(urls=['url_1', 'url_2', 'url_3'], name='Thread_1')
t2 = myThread(urls=['url_4', 'url_5', 'url_6'], name='Thread_2')
# Start both workers before joining either, so that they run concurrently.
for worker in (t1, t2):
    worker.start()
for worker in (t1, t2):
    worker.join()
print('%s ended.' % main_thread_name)

线程同步

import threading
# Lock guarding the shared counter below.
mylock = threading.RLock()
# Counter shared by all threads; incremented until it reaches 4.
num = 0


class myThread(threading.Thread):
    """Thread that increments the shared counter `num` under `mylock` until it reaches 4."""

    def __init__(self, name):
        threading.Thread.__init__(self, name=name)

    def run(self):
        """Repeatedly take the lock, bump the counter, and stop once it is >= 4."""
        global num
        thread_name = threading.current_thread().name
        while True:
            # `with` releases the lock on every path, including exceptions.
            with mylock:
                print('%s locked, Number: %d' % (thread_name, num))
                finished = num >= 4
                if not finished:
                    num += 1
                # Copy the value while still holding the lock so that the
                # message below reports what this thread actually saw.
                snapshot = num
            # Printed only after the lock has really been released.
            print('%s released, Number: %d' % (thread_name, snapshot))
            if finished:
                break
if __name__ == '__main__':
    thread1 = myThread('Thread_1')
    thread2 = myThread('Thread_2')
    thread1.start()
    thread2.start()
    # Wait explicitly for both threads instead of relying on interpreter exit.
    thread1.join()
    thread2.join()

2.3 协程

协程,是一种轻量级线程,gevent是一个基于协程的Python网络函数库,比较完善地提供了协程的支持。

from gevent import monkey;monkey.patch_all()
import gevent
import urllib2

def run_task(url):
    """Fetch `url` and print how many bytes were received.

    Any exception raised while fetching is printed rather than propagated.
    """
    print('Visit --> %s' % url)
    try:
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        print('%d bytes received from %s.' % (len(data), url))
    except Exception as e:
        print(e)

if __name__ == '__main__':
    urls = ['https://github.com/', 'https://www.python.org/', 'https://www.cnblogs.com/']
    # Spawn one greenlet per URL, then wait until all of them have finished.
    greenlets = [gevent.spawn(run_task, url) for url in urls]
    gevent.joinall(greenlets)

使用gevent中的pool对象,对动态数量的greenlet进行并发管理

from gevent import monkey
monkey.patch_all()
import urllib2
from gevent.pool import Pool

def run_task(url):
    """Fetch `url`, print how many bytes were received, and return a status string.

    Any exception raised while fetching is printed rather than propagated;
    the status string is returned in either case.
    """
    print('Visit --> %s' % url)
    try:
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        print('%d bytes received from %s.' % (len(data), url))
    except Exception as e:
        print(e)
    return 'rul:%s --->finish' % url

if __name__ == '__main__':
    # Pool(2) limits the number of greenlets running at the same time to 2.
    pool = Pool(2)
    urls = ['https://github.com/', 'https://www.python.org/', 'http://www.cnblogs.com/']
    # map() runs run_task on each URL and collects the return values.
    results = pool.map(run_task, urls)
    print(results)

2.4 分布式进程

分布式进程指的是将Process进程分布到多台机器上。
首先创建服务进程:

try:
    import Queue  # Python 2
except ImportError:
    import queue as Queue  # the module was renamed in Python 3
from multiprocessing.managers import BaseManager
from multiprocessing import freeze_support

# Number of tasks; also used as the capacity of the send and receive queues
task_number = 10
task_queue = Queue.Queue(task_number)
result_queue = Queue.Queue(task_number)


def get_task():
    """Return the module-level task queue (registered with the manager as a callable)."""
    return task_queue


def get_result():
    """Return the module-level result queue (registered with the manager as a callable)."""
    return result_queue

# Create QueueManager class
class QueueManager(BaseManager):
    """Manager subclass on which the task and result queue accessors are registered."""
    pass
def win_run():
    """Start the queue manager server, publish 10 tasks and print 10 results.

    Errors raised while talking to the manager (including a timeout while
    waiting for a result) are printed instead of propagated; the manager is
    always shut down afterwards.
    """
    QueueManager.register('get_task_queue', callable=get_task)
    QueueManager.register('get_result_queue', callable=get_result)

    # Bind port and set up the validation token. authkey is a byte string so
    # that the same code is accepted by Python 2 and Python 3.
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'enterprise')

    # Initiate
    manager.start()
    try:
        # Access task queue and result queue through network
        task = manager.get_task_queue()
        result = manager.get_result_queue()

        # Add task
        for url in ['ImageUrl_' + str(i) for i in range(10)]:
            print('Put task %s ...' % url)
            task.put(url)
        print('try get result')
        for i in range(10):
            print('result is %s ' % result.get(timeout=10))
    except Exception as err:
        # %r because some errors (e.g. the queue Empty timeout) have an
        # empty str() and would otherwise print nothing useful.
        print('Manager error: %r' % (err,))
    finally:
        # Must close, or there will be an error
        manager.shutdown()
if __name__ == '__main__':
    # freeze_support() only has an effect in a frozen Windows executable;
    # elsewhere it does nothing.
    freeze_support()
    win_run()

其次,创建任务进程:

import time
from multiprocessing.managers import BaseManager


# Create QueueManager
class QueueManager(BaseManager):
    """Client-side manager on which the remote queue accessors are registered by name."""
    pass

try:
    from Queue import Empty  # Python 2
except ImportError:
    from queue import Empty  # Python 3

# Use QueueManager to register; only the names are needed on the client side
QueueManager.register('get_task_queue')
QueueManager.register('get_result_queue')

# Connect to the server
server_addr = '127.0.0.1'
print('Connect to server %s...' % server_addr)

# Port and validation token should be the same as the taskManager.
# authkey is a byte string so that Python 2 and Python 3 both accept it.
m = QueueManager(address=(server_addr, 8001), authkey=b'enterprise')

m.connect()

# Access Queue object
task = m.get_task_queue()
result = m.get_result_queue()

# Take tasks from the queue and write into the result queue. Testing
# empty() before get() is racy when several workers share the queue, so
# instead keep taking tasks until none arrives within 5 seconds.
while True:
    try:
        image_url = task.get(True, timeout=5)
    except Empty:
        break
    print('run task download %s...' % image_url)
    time.sleep(1)
    result.put('%s--->success' % image_url)

print('worker exit.')

3 网络编程

Socket是网络编程的一个抽象概念,Python提供了两个基本的Socket模块,分别是socket和SocketServer(在Python 3中更名为socketserver)。

3.1 TCP编程

TCP是一种面向连接的通信方式。下面演示创建TCP服务端:

import socket
import threading
import time

def dealClient(sock, addr):
    """Serve one TCP client: greet it, then echo each message until it leaves.

    The loop ends when the peer closes the connection or sends 'exit'. The
    socket is always closed, even if sending or receiving raises.

    sock -- connected socket for this client
    addr -- (host, port) tuple of the client, used in log messages
    """
    # Receive data and send out
    print('Accept new connection from %s:%s...' % addr)
    try:
        # sendall() keeps sending until the whole message has been written,
        # whereas send() may write only part of it.
        sock.sendall(b'Hello, I am server')
        while True:
            data = sock.recv(1024)
            if not data:
                break
            text = data.decode('utf-8')
            if text == 'exit':
                break
            # The delay applies only to messages that are answered, so a
            # disconnect or 'exit' is handled without waiting.
            time.sleep(1)
            print('--->>%s!' % text)
            sock.sendall(('Loop_Msg: %s!' % text).encode('utf-8'))
    finally:
        # Close Socket
        sock.close()
    print('Connection from %s:%s closed.' % addr)

if __name__ == '__main__':
    # Create a Socket based on IPv4 and TCP protocol
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Socket binds IP(127.0.0.1) and port
        s.bind(('127.0.0.1', 9999))

        # Listen, allowing up to 5 pending connections
        s.listen(5)
        print('Waiting for connectoin...')
        while True:
            # Receive a new connection
            sock, addr = s.accept()
            # Create a new thread to deal with TCP connection
            t = threading.Thread(target=dealClient, args=(sock, addr))
            t.start()
    finally:
        # Release the listening socket when the loop is interrupted.
        s.close()

TCP客户端:

import socket
# Initialize Socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    # Connect to the target IP and port
    s.connect(('127.0.0.1', 9999))
    # Receive message
    print('--->>' + s.recv(1024).decode('utf-8'))
    # Send message; sendall() writes the whole message, send() may not
    s.sendall(b'Hello, I am a client')
    print('--->>' + s.recv(1024).decode('utf-8'))
    # Tell the server to end the session
    s.sendall(b'exit')
finally:
    # Close Socket, even if connecting or sending failed
    s.close()

3.2 UDP编程

UDP是面向无连接的协议,只需要知道对方的IP地址和端口号,就可以直接发数据报。先创建服务端:

import socket

# UDP server: answer every datagram with a greeting that contains its payload.
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
    s.bind(('127.0.0.1', 9999))
    print('Bind UDP on 9999...')
    while True:
        # recvfrom() returns the payload and the sender's address.
        data, addr = s.recvfrom(1024)
        print('Received from %s:%s.' % addr)
        s.sendto(b'Hello, %s!' % data, addr)
finally:
    # Release the socket when the loop is interrupted.
    s.close()

再创建客户端

import socket

# UDP client: send two datagrams and print the reply to each.
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
    for data in [b'Hello', b'World']:
        s.sendto(data, ('127.0.0.1', 9999))
        print(s.recv(1024).decode('utf-8'))
finally:
    # Close only after all datagrams have been exchanged.
    s.close()
阅读更多
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: