# Simple crawler example 01
import re
import time
import gzip
import random
import urllib.request
from bs4 import BeautifulSoup
# target URLs tried during development; only the last assignment takes effect
# url_link = 'http://mebook.cc/download.php?id=6064'
# url_link = 'http://mebook.cc/6064.html'
# url_link = 'http://mebook.cc/download.php?id=7456'
# url_link = 'http://www.kxdaili.com/dailiip/1/1.html'
# url_link = 'https://www.kuaidaili.com/free/inha/2/'
url_link = 'http://www.xicidaili.com/wn/2'
UserAgentList = ['Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0',
'Mozilla/5.0 (X11; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0',
'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; '
'InfoPath.3; MS-RTC LM 8; Zune 4.7)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)',
'Mozilla/5.0 (IE 11.0; Windows NT 6.3; Trident/7.0; .NET4.0E; .NET4.0C; rv:11.0) like Gecko',
'Mozilla/5.0 (IE 11.0; Windows NT 6.3; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/37.0.2062.103 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/40.0.2214.38 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36']
# https? matches both schemes ([s]* would also accept a malformed "httpss")
match_obj = re.match(r'^https?://([^/]*)', url_link)
if match_obj:
    host = match_obj.group(1)
header = {'Host': host,
'User-Agent': UserAgentList[5],
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
# 'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Encoding': 'gzip, deflate',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}
print(header)
url_request = urllib.request.Request(url_link, headers=header)
# url_request.add_header('User-agent', 'Mozilla/5.0')
# url_request.add_header('User-agent', 'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.656.400 QQBrowser/9.0.2524.400')
# url_request.add_header('User-Agent', random.choice(UserAgentList))
# url_request.add_header('User-Agent', UserAgentList[5])
url_open = urllib.request.urlopen(url_request, timeout=5)
html_doc = url_open.read()
# decompress only if the server actually responded with gzip
if url_open.info().get('Content-Encoding') == 'gzip':
    html_doc = gzip.decompress(html_doc)
html_doc = html_doc.decode('utf-8')
print(html_doc)
bt_soup = BeautifulSoup(html_doc, 'html.parser')
print(bt_soup.find_all('tr'))
# for http://www.kxdaili.com/dailiip/1/1.html
match_obj = re.search(r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s+<td>(\d+)</td>', str(bt_soup.find_all('tr')[1]))
print(match_obj.group(0))
print(match_obj.group(1))
print(match_obj.group(2))
# for https://www.kuaidaili.com/free/inha/2/
match_obj = re.search(r'<td data-title="IP">(\d+\.\d+\.\d+\.\d+)</td>\s+<td data-title="PORT">(\d+)</td>\s+<td data-title="匿名度">\w+</td>\s+<td data-title="类型">(\w+)</td>', str(bt_soup.find_all('tr')[1]))
print(match_obj.group(0))
print(match_obj.group(1))
print(match_obj.group(2))
# http://www.xicidaili.com/wn
match_obj = re.search(r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s+<td>(\d+)</td>\s+.*?<td>(\w+)</td>', str(bt_soup.find_all('tr')[2]), re.S)
print(match_obj.group(0))
print(match_obj.group(1))
print(match_obj.group(2))
# grab the page number linked just before the "next page" control
print(re.search(r'<a\s+href="/wn/(\d+)">\d+</a>\s+<a\s+class="next_page"', str(bt_soup.find_all('div', 'pagination'))).group(1))
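# A hedged alternative to regexing over str(soup): read the parsed cells directly.
# Minimal sketch assuming the xicidaili row layout above (ip in the 2nd <td>,
# port in the 3rd); adjust the indices for other sites.
for tr in bt_soup.find_all('tr')[1:]:
    tds = tr.find_all('td')
    if len(tds) >= 3:
        print(tds[1].get_text(strip=True), tds[2].get_text(strip=True))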
# 2018-01-19 23:32:01,748 [line: 39] - DEBUG - IP: 122.231.38.43 Port:808
# 2018-01-19 23:32:01,751 [line: 39] - DEBUG - IP: 120.236.142.103 Port:8888
# 2018-01-19 23:32:01,753 [line: 39] - DEBUG - IP: 120.79.162.100 Port:1080
# 2018-01-19 23:32:01,756 [line: 39] - DEBUG - IP: 223.241.117.65 Port:8010
# 2018-01-19 23:32:01,758 [line: 39] - DEBUG - IP: 114.101.46.160 Port:65309
# 2018-01-19 23:32:01,760 [line: 39] - DEBUG - IP: 111.195.228.131 Port:8123
# 2018-01-19 23:32:01,762 [line: 39] - DEBUG - IP: 39.108.171.142 Port:80
# 2018-01-19 23:32:01,768 [line: 39] - DEBUG - IP: 218.60.149.144 Port:80
# 2018-01-19 23:32:01,770 [line: 39] - DEBUG - IP: 120.76.79.21 Port:80
# 2018-01-19 23:32:01,779 [line: 39] - DEBUG - IP: 121.40.65.178 Port:8080
import pandas
op_result = pandas.DataFrame(columns=['ip', 'port'])
print(hex(id(op_result)))
def op_pandas(op_result):
    # op_result is a local name here; rebinding it does not touch the caller's DataFrame
    print(hex(id(op_result)))
    # append() returns a new DataFrame each time, so the id changes after every call
    op_result = op_result.append({'ip': '122.231.38.43', 'port': '808'}, ignore_index=True)
    print(hex(id(op_result)))
    op_result = op_result.append({'ip': '120.236.142.103', 'port': '8888'}, ignore_index=True)
    print(hex(id(op_result)))
    op_result = op_result.append({'ip': '120.79.162.100', 'port': '1080'}, ignore_index=True)
    print(hex(id(op_result)))
    op_result = op_result.append({'ip': '223.241.117.65', 'port': '8010'}, ignore_index=True)
    print(hex(id(op_result)))
# print(op_result)
op_pandas(op_result)
print(hex(id(op_result)))
print(op_result)
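# Note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# A minimal sketch of the same idea with pandas.concat (the rows reuse the sample
# data above; this is illustrative, not part of the original script):
rows = [{'ip': '122.231.38.43', 'port': '808'},
        {'ip': '120.236.142.103', 'port': '8888'}]
op_result = pandas.concat([op_result, pandas.DataFrame(rows)], ignore_index=True)
print(op_result)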
base_url = 'https://www.kuaidaili.com/free/inha/{}/'
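# the same User-Agent pool as above, re-declared so this snippet runs on its own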
UserAgentList = ['Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0',
'Mozilla/5.0 (X11; Linux i686; rv:30.0) Gecko/20100101 Firefox/30.0',
'Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0',
'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; '
'InfoPath.3; MS-RTC LM 8; Zune 4.7)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)',
'Mozilla/5.0 (IE 11.0; Windows NT 6.3; Trident/7.0; .NET4.0E; .NET4.0C; rv:11.0) like Gecko',
'Mozilla/5.0 (IE 11.0; Windows NT 6.3; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/37.0.2062.103 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/40.0.2214.38 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
'(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36']
for index in range(1, 11):
"""
header = {'Connection': 'keep-alive',
'Cache-Control': 'max-age=0',
'Upgrade-Insecure-Requests': '1',
'User-Agent': random.choice(UserAgentList),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
# 'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
}
"""
url_link = base_url.format(index)
# url_request = urllib.request.Request(url_link, headers=header)
url_request = urllib.request.Request(url_link)
# url_request.add_header('User-agent', 'Mozilla/5.0')
# url_request.add_header('User-agent', 'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.656.400 QQBrowser/9.0.2524.400')
url_request.add_header('User-Agent', random.choice(UserAgentList))
# url_request.add_header(header)
url_open = urllib.request.urlopen(url_request)
html_doc = url_open.read().decode('utf-8')
print(html_doc)
time.sleep(2)
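# A small helper sketch bundling the per-page fetch above (random User-Agent plus a
# polite delay). The name fetch_page and its defaults are illustrative, not from
# the original script.
def fetch_page(url, ua_list, delay=2):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', random.choice(ua_list))
    with urllib.request.urlopen(req, timeout=5) as resp:
        doc = resp.read().decode('utf-8')
    time.sleep(delay)  # stay polite between successive requests
    return doc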
base_url = 'http://www.xicidaili.com/wn/{}'
proxies_list = [{'http': '47.96.250.208:3128'}, {'http': '47.93.185.19:80'}, {'http': '122.4.238.66:8080'}, {'http': '183.163.46.201:42419'}, {'http': '182.92.242.11:80'}, {'http': '117.146.19.161:3128'}, {'http': '183.163.46.146:42419'}, {'http': '114.222.151.115:808'}, {'http': '121.31.198.170:8123'}, {'http': '180.121.162.86:47998'}]
for index in range(1, 11):
    proxies = random.choice(proxies_list)
    # proxies = {'http': '114.222.151.115:808'}
    print(proxies)
    url_link = base_url.format(index)
    proxy_handler = urllib.request.ProxyHandler(proxies)
    opener = urllib.request.build_opener(proxy_handler)
    # install globally so the plain urlopen() below routes through this proxy
    urllib.request.install_opener(opener)
    # url_request.add_header('User-Agent', random.choice(UserAgentList))
    url_request = urllib.request.Request(url_link)
    url_request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0')
    # retry a few times through this proxy instead of looping forever on a dead one
    html_doc = None
    for _ in range(3):
        try:
            url_open = urllib.request.urlopen(url_request, timeout=5)
            html_doc = url_open.read().decode('utf-8')
            break
        except Exception:
            time.sleep(2)
    print(html_doc)
    break  # stop after the first page; the loop exists only to demo proxy rotation
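# A minimal proxy-health-check sketch: fetch a tiny page through each proxy and keep
# only those that answer in time. httpbin.org/ip is an assumed test URL, and the
# proxy_alive name is illustrative.
def proxy_alive(proxy, timeout=5):
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))
    try:
        opener.open('http://httpbin.org/ip', timeout=timeout)
        return True
    except Exception:
        return False

live_proxies = [p for p in proxies_list if proxy_alive(p)]
print(live_proxies)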
import re
import time
import gzip
import random
import urllib.request
from bs4 import BeautifulSoup
url_link = 'https://proxy.mimvp.com/free.php?proxy=in_tp&sort=&page=1'
match_obj = re.match(r'^https?://([^/]*)', url_link)
if match_obj:
    host = match_obj.group(1)
header = {'Host': host,
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
# 'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Encoding': 'gzip, deflate',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}
print(header)
enable_gzip = False
# decompress later only if gzip was advertised in the request's Accept-Encoding
if 'gzip' in header.get('Accept-Encoding', ''):
    enable_gzip = True
url_request = urllib.request.Request(url_link, headers=header)
url_open = urllib.request.urlopen(url_request, timeout=5)
if enable_gzip:
html_doc = url_open.read()
html_doc = gzip.decompress(html_doc).decode('utf-8')
else:
html_doc = url_open.read().decode('utf-8')
print(html_doc)
bt_soup = BeautifulSoup(html_doc, 'html.parser')
print(bt_soup.find_all('td', 'tbl-proxy-port'))
# findall() returns a list of (ip, port, type) tuples; it has no .group() method
print(re.findall(r'class=\'tbl-proxy-ip\'[^>]+(\d+\.\d+\.\d+\.\d+).*?port=(\w+).*?title=\'[\w/]+\'>([\w/]+)', html_doc))
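# Sketch: convert the scraped rows into the {'http': 'ip:port'} form used with
# ProxyHandler earlier. Note the port token on this site may be an encoded value
# rather than a plain number, so treat this as illustrative only.
rows = re.findall(r'class=\'tbl-proxy-ip\'[^>]+(\d+\.\d+\.\d+\.\d+).*?port=(\w+).*?title=\'[\w/]+\'>([\w/]+)', html_doc)
scraped_proxies = [{'http': '{}:{}'.format(ip, port)} for ip, port, _ in rows]
print(scraped_proxies)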