您的位置:首页 > 运维架构 > Nginx

Nginx支持反爬虫并限制客户端的请求的并发数

2018-03-03 13:16 435 查看
cat /usr/local/nginx/conf/agent_deny.conf

# Reject search-engine crawlers by User-Agent (case-insensitive match).
# Remove the Baiduspider / Googlebot* entries from this list if the site
# should remain indexable by Baidu and Google.
if ($http_user_agent ~* "qihoobot|Baiduspider|Googlebot|Googlebot-Mobile|Googlebot-Image|Mediapartners-Google|Adsbot-Google|Feedfetcher-Google|Yahoo! Slurp|Yahoo! Slurp China|YoudaoBot|Sosospider|Sogou spider|Sogou web spider|MSNBot|ia_archiver|Tomato Bot|Catall Spider|AcoiRobot") {
    return 403;
}

# Reject scraping tools, HTTP client libraries and benchmark clients
# (case-sensitive match). The final "|^$" alternative also rejects requests
# whose User-Agent header is empty or missing; it was previously written as
# "iaskspider^$", which can never match.
if ($http_user_agent ~ "WinHttp|WebZIP|FetchURL|node-superagent|java/|FeedDemon|Jullo|JikeSpider|Indy Library|Alexa Toolbar|AskTbFXTV|AhrefsBot|CrawlDaddy|Java|Feedly|Apache-HttpAsyncClient|UniversalFeedParser|ApacheBench|Microsoft URL Control|Swiftbot|ZmEu|oBot|jaunty|Python-urllib|lightDeckReports Bot|YYSpider|DigExt|HttpClient|MJ12bot|heritrix|EasouSpider|Ezooms|BOT/0.1|YandexBot|FlightDeckReports|Linguee Bot|iaskspider|^$") {
    return 403;
}

# Only GET, HEAD and POST are accepted; every other request method gets 403.
if ($request_method !~ ^(GET|HEAD|POST)$) {
    return 403;
}

# Catch-all for generic tool/library names (case-insensitive).
# Note: "Spider" matched case-insensitively also matches "Baiduspider".
if ($http_user_agent ~* (Python|Java|Wget|Scrapy|Curl|HttpClient|Spider)) {
    return 403;
}

# Examples (commented out) of blocking by address:
# Block a single IP address:
#deny 123.45.6.7;
# Block a whole /8, i.e. 123.0.0.1 through 123.255.255.254:
#deny 123.0.0.0/8;
# Block a /16, i.e. 123.45.0.1 through 123.45.255.254:
#deny 123.45.0.0/16;
# Block a /24, i.e. 123.45.6.1 through 123.45.6.254:
#deny 123.45.6.0/24;
# The addresses below are known abusive clients.
deny 58.95.66.0/24;

注释:

一般情况下,网站是允许百度爬虫和谷歌爬虫抓取内容的(例如官网首页),因此可以对这两类爬虫放行。注意:上面的第一条规则包含 Baiduspider、Googlebot 等关键字,最后一条规则中的 Spider(不区分大小写)也会匹配 Baiduspider;如需放行,请把这些关键字从对应规则中删除。
agent_deny.conf 这个文件需要通过 include 指令包含到网站官网的 server 虚拟主机配置中。

以下的nginx配置文件是反向代理负载均衡的配置文件:

# Reverse-proxy virtual host for pk.tltest.com / static.tltest.com.
# The names referenced below -- log format "main", cache zone "cache_one",
# limit_req zone "reqip", limit_conn zone "conip" and upstream "backend" --
# are not defined in this block and must be declared elsewhere
# (normally in the http context).
server {
    listen       80;
    server_name  pk.tltest.com static.tltest.com;
    access_log   /home/wwwlogs/access.log  main;

    # Anti-crawler rules (User-Agent / request-method / IP filters).
    include /usr/local/nginx/conf/agent_deny.conf;

    # Default location: rate-limited, cached reverse proxy.
    location / {
        limit_req zone=reqip burst=200 nodelay;

        # Cache successful responses and redirects for 99s, anything else for 1s.
        proxy_cache cache_one;
        proxy_cache_valid  200 304 301 302 99s;
        proxy_cache_valid any 1s;
        # NOTE(review): with Set-Cookie and Cache-Control ignored, responses
        # that set cookies or mark themselves uncacheable are still cached.
        # Confirm that no per-user content is served through this location.
        proxy_ignore_headers Set-Cookie Cache-Control;

        proxy_redirect off;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header REMOTE-HOST $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # HTTP/1.1 with an empty Connection header, as needed for upstream keepalive.
        proxy_set_header Connection "";
        proxy_http_version 1.1;

        # This directive was previously given twice in this location with
        # contradictory values ("off" and an explicit failure list). Only one
        # is kept; "off" disables retrying the request on another upstream
        # server. NOTE(review): "off" is believed to have been the effective
        # setting -- confirm. To enable failover instead, replace it with:
        #proxy_next_upstream error timeout invalid_header http_500 http_502 http_503;
        proxy_next_upstream off;

        # Keep the upstream request running even if the client disconnects.
        proxy_ignore_client_abort on;

        client_max_body_size 30m;
        client_body_buffer_size 256k;

        # Timeouts are in seconds.
        proxy_connect_timeout 75;
        proxy_send_timeout 300;
        proxy_read_timeout 300;

        proxy_buffer_size 1m;
        proxy_buffers 8 512k;
        proxy_busy_buffers_size 2m;
        proxy_temp_file_write_size 2m;
        proxy_max_temp_file_size 128m;

        proxy_pass http://backend;
    }

    # Requests ending in .php or .python. The "~*" modifier makes this a
    # case-insensitive regex location; without a modifier nginx would treat
    # the pattern as a literal prefix string and it would never match.
    location ~* \.(php|python)$ {
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For  $remote_addr;
        proxy_pass http://backend;
    }

    # Limit concurrent connections and request rate for search requests.
    # NOTE(review): "=" matches only the exact URI /novel/search, not paths
    # beneath it -- confirm that this is intended.
    location = /novel/search {
        limit_conn conip 2;
        limit_req zone=reqip burst=3 nodelay;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For  $remote_addr;
        proxy_pass http://backend;
        #access_log /home/wwwlogs/search.log  main;
    }

    # Limit concurrent connections, request rate and bandwidth for file
    # content downloads. NOTE(review): exact-match location, see the note on
    # /novel/search.
    location = /novel/read/cache {
        limit_conn conip 1;
        limit_req zone=reqip burst=2 nodelay;
        # Cap the response transfer rate to the client.
        limit_rate 512k;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For  $remote_addr;
        proxy_pass http://backend;
        #access_log /home/wwwlogs/download.log  main;
    }

    # Limit concurrent connections and request rate for this endpoint.
    # NOTE(review): the original comment described this as APK downloads,
    # but the path is /novel/read/content -- confirm which is meant.
    location = /novel/read/content {
        limit_conn conip 5;
        limit_req zone=reqip burst=10 nodelay;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-For  $remote_addr;
        proxy_pass http://backend;
    }

}

参考文档:
https://www.centos.bz/2018/01/nginx%E6%94%AF%E6%8C%81https%E5%B9%B6%E4%B8%94%E6%94%AF%E6%8C%81%E5%8F%8D%E7%88%AC%E8%99%AB/
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  反爬虫 限制 客户端