# 简单爬虫示例02 (Simple web crawler example 02)

import re
import time
import gzip
import random
import urllib.request
from bs4 import BeautifulSoup

url_link = 'https://item.jd.com/2316993.html'


def extract_host(url):
    """Return the host part of an ``http://`` or ``https://`` URL.

    Args:
        url: Absolute URL string.

    Returns:
        Everything between ``://`` and the first ``/`` (or end of string).

    Raises:
        ValueError: If ``url`` does not start with ``http://``/``https://``
            or has an empty host.
    """
    match_obj = re.match(r'^https?://([^/]+)', url)
    if match_obj is None:
        # Fail loudly here rather than with a NameError further down.
        raise ValueError('cannot extract host from URL: %r' % (url,))
    return match_obj.group(1)


def decode_body(raw, content_encoding=None, charset=None):
    """Decompress and decode an HTTP response body into text.

    Args:
        raw: Response body as ``bytes``.
        content_encoding: Value of the response ``Content-Encoding`` header,
            or ``None`` when the header is absent.
        charset: Character set declared by the response, or ``None``.
            Falls back to UTF-8 when missing or unknown to Python.

    Returns:
        The body as ``str``. Undecodable bytes are replaced with U+FFFD
        instead of raising, so a mislabelled page can still be printed.
    """
    coding = (content_encoding or '').strip().lower()
    if coding in ('gzip', 'x-gzip'):
        raw = gzip.decompress(raw)
    elif coding == 'deflate':
        import zlib  # only needed for this branch

        try:
            raw = zlib.decompress(raw)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib wrapper.
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    # Any other value (absent, "identity", ...) is treated as uncompressed.

    try:
        return raw.decode(charset or 'utf-8', errors='replace')
    except LookupError:
        # The declared charset is not a codec Python knows about.
        return raw.decode('utf-8', errors='replace')


if __name__ == '__main__':
    host = extract_host(url_link)

    # Browser-like request headers. Only encodings that decode_body() can
    # undo are advertised in Accept-Encoding.
    header = {'Host': host,
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Language': 'en-US,en;q=0.5',
              'Accept-Encoding': 'gzip, deflate',
              'DNT': '1',
              'Connection': 'keep-alive',
              'Upgrade-Insecure-Requests': '1',
              }
    print(header)

    url_request = urllib.request.Request(url_link, headers=header)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url_request, timeout=5) as url_open:
        raw_body = url_open.read()
        # Decide how to decode from what the server actually sent, not from
        # what the request asked for.
        content_encoding = url_open.headers.get('Content-Encoding')
        charset = url_open.headers.get_content_charset()

    html_doc = decode_body(raw_body, content_encoding, charset)
    print(html_doc)

# (Leftover search-widget text from the source web page, not part of the
# script: 'results matching ""' / 'No results matching ""'.)