Proxy_Spider

西刺免费代理IP爬取

目标URL:http://www.xicidaili.com/wt(注:下文代码实际抓取的是 http://www.xicidaili.com/nt/ 下的分页列表)
1、本次使用 scrapy 库中的 Selector,通过 XPath 定位数据,个人认为比较方便。若直接用 pip 在线安装失败,可以先下载对应的库文件(whl 安装包),再用 pip 安装该文件,这样可以省去在本地编译依赖环境的过程
2、分析目标网站源码,获取要爬取的数据的定位
image.png
不难发现,我们要的ip、端口、协议均在td标签里面
分别是第2、3、6个td
3、使用Selector库超级方便,selector.xpath(...) 即可定位到要获取的数据;在XPath末尾加上 /text() 取节点的文字内容,再调用 extract() 把匹配结果转换为字符串列表
自己写的脚本,大牛勿喷

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#  _*_coding:utf-8_*_
import requests
import re
from scrapy.selector import Selector

def get_all_ip(pages=range(3, 8), timeout=2):
    """Collect proxies from the xicidaili "nt" listing and keep the working ones.

    Each listing page is downloaded and every table row after the header is
    read for its IP (2nd cell), port (3rd cell) and protocol (6th cell).
    Each proxy is then tried against httpbin.org/ip, and only proxies that
    answer with a 2xx status are kept.

    Args:
        pages: Iterable of listing page numbers to scrape.
        timeout: Seconds to wait for each proxy test request.

    Returns:
        A list of ``(protocol, ip, port)`` string tuples (protocol in lower
        case) gathered across all requested pages.

    Raises:
        requests.RequestException: If a listing page itself cannot be fetched.
    """
    headers = {
        'Upgrade-Insecure-Requests': '1',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/59.0.3071.115 Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Referer': 'http://www.xicidaili.com/nt/',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }

    # Created once so results from every page accumulate in a single list.
    ip_lists = []
    for page in pages:
        url = 'http://www.xicidaili.com/nt/%d' % page
        # A timeout keeps an unresponsive listing site from hanging the run.
        resp = requests.get(url=url, headers=headers, timeout=10)
        rows = Selector(text=resp.text).xpath('//*[@id="ip_list"]//tr')

        for row in rows[1:]:  # the first row is the table header
            ip_li = row.xpath('td[2]/text()').extract_first()
            port_str = row.xpath('td[3]/text()').extract_first()
            type_str = row.xpath('td[6]/text()').extract_first()
            if not (ip_li and port_str and type_str):
                # Incomplete row: nothing usable to test.
                continue

            ip_li = ip_li.strip()
            port_str = port_str.strip()
            type_str = type_str.strip().lower()
            if type_str not in ('http', 'https'):
                # Only protocols that can be checked with a plain GET.
                continue

            # requests chooses a proxy by the scheme of the requested URL, so
            # the test URL must use the proxy's protocol or the request would
            # bypass the proxy entirely and always "succeed".
            check_url = '%s://httpbin.org/ip' % type_str
            proxies = {type_str: 'http://' + ip_li + ':' + port_str}
            try:
                req = requests.get(check_url, headers=headers,
                                   proxies=proxies, timeout=timeout)
            except requests.RequestException:
                # Dead, slow or misbehaving proxy: drop it.
                continue

            if 200 <= req.status_code < 300:
                ip_lists.append((type_str, ip_li, port_str))

    return ip_lists

if __name__ == '__main__':
    # Show every validated proxy tuple on its own line.
    for proxy in get_all_ip():
        print(proxy)

4、运行测试
image.png
5、当我们想要使用代理的时候,使用random随机获取几个就可以了
6、加了随机获取*条数据的函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#  _*_coding:utf-8_*_
import requests
import re
import random
from scrapy.selector import Selector

def get_all_ip(pages=range(1, 2), timeout=2):
    """Collect proxies from the xicidaili "nt" listing and keep the working ones.

    Each listing page is downloaded and every table row after the header is
    read for its IP (2nd cell), port (3rd cell) and protocol (6th cell).
    Each proxy is then tried against httpbin.org/ip, and only proxies that
    answer with a 2xx status are kept.

    Args:
        pages: Iterable of listing page numbers to scrape.
        timeout: Seconds to wait for each proxy test request.

    Returns:
        A list of ``(protocol, ip, port)`` string tuples (protocol in lower
        case) gathered across all requested pages.

    Raises:
        requests.RequestException: If a listing page itself cannot be fetched.
    """
    headers = {
        'Upgrade-Insecure-Requests': '1',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/59.0.3071.115 Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Referer': 'http://www.xicidaili.com/nt/',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }

    # Created once so results from every page accumulate in a single list.
    ip_lists = []
    for page in pages:
        url = 'http://www.xicidaili.com/nt/%d' % page
        # A timeout keeps an unresponsive listing site from hanging the run.
        resp = requests.get(url=url, headers=headers, timeout=10)
        rows = Selector(text=resp.text).xpath('//*[@id="ip_list"]//tr')

        for row in rows[1:]:  # the first row is the table header
            ip_li = row.xpath('td[2]/text()').extract_first()
            port_str = row.xpath('td[3]/text()').extract_first()
            type_str = row.xpath('td[6]/text()').extract_first()
            if not (ip_li and port_str and type_str):
                # Incomplete row: nothing usable to test.
                continue

            ip_li = ip_li.strip()
            port_str = port_str.strip()
            type_str = type_str.strip().lower()
            if type_str not in ('http', 'https'):
                # Only protocols that can be checked with a plain GET.
                continue

            # requests chooses a proxy by the scheme of the requested URL, so
            # the test URL must use the proxy's protocol or the request would
            # bypass the proxy entirely and always "succeed".
            check_url = '%s://httpbin.org/ip' % type_str
            proxies = {type_str: 'http://' + ip_li + ':' + port_str}
            try:
                req = requests.get(check_url, headers=headers,
                                   proxies=proxies, timeout=timeout)
            except requests.RequestException:
                # Dead, slow or misbehaving proxy: drop it.
                continue

            if 200 <= req.status_code < 300:
                ip_lists.append((type_str, ip_li, port_str))

    return ip_lists

def random_ip(ip_lists, count=6):
    """Print and return a random selection of proxies.

    Args:
        ip_lists: Sequence of proxy entries to choose from.
        count: How many entries to pick. When fewer entries are available,
            all of them are returned (in random order) instead of raising.

    Returns:
        A list of distinct entries drawn from ``ip_lists``.
    """
    # random.sample raises ValueError when asked for more items than exist
    # (or for a negative number), so clamp the request to the valid range.
    count = max(0, min(count, len(ip_lists)))
    proxies = random.sample(ip_lists, count)
    print(proxies)
    return proxies

def write(ip_lists):
    """Write the proxies to ``proxy.txt`` in the current directory, one per line.

    Args:
        ip_lists: Iterable of proxy entries. Each entry is either a ready-made
            string, written as-is, or a ``(protocol, ip, port)`` tuple, which
            is written as ``protocol://ip:port``.
    """
    lines = []
    for entry in ip_lists:
        if isinstance(entry, str):
            lines.append(entry + '\n')
        else:
            # Tuples cannot be concatenated with a string, so build the
            # textual form explicitly.
            type_str, ip_li, port_str = entry
            lines.append('%s://%s:%s\n' % (type_str, ip_li, port_str))

    with open("proxy.txt", 'w') as f:
        f.writelines(lines)

if __name__ == '__main__':
    # Banner, then scrape and validate proxies, then show a random selection.
    divider = '-' * 60
    print(divider)
    print("程序启动中")
    print(divider)
    print("正在获取数据,请稍后....")
    harvested = get_all_ip()
    print('随机获取*条数据')
    print(divider)
    random_ip(harvested)
    print("well done!")