爬虫(20):深度爬取策略(2)
import re
import sys

import requests

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

# Seconds to wait for a server before giving up on a page; without a timeout
# a single stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 10

# Matches a double-quoted absolute http(s) URL that is followed by a '>' later
# on the same line.  The trailing condition is a lookahead so it consumes
# nothing: every URL on a line is reported, not just the first one.
URL_PATTERN = re.compile(r'"(https?://[^"\n]*)"(?=[^>\n]*>)')

# Original author's note: "World Cup quarter-finalists announced"
# (the topic of the sample search used as the crawl seed below).


def getUrl(url):
    """Download ``url`` and return every absolute http(s) link found in it.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of URL strings in document order; duplicates are not removed.

    Raises:
        requests.RequestException: propagated from ``getHtml`` when the page
            cannot be downloaded.
    """
    html = getHtml(url)
    return URL_PATTERN.findall(html)


def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped rather than raising.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.RequestException: on connection errors or when the server
            does not answer within ``REQUEST_TIMEOUT`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    return response.content.decode('utf-8', 'ignore')


def vastSpider(depth):
    """Crawl outward from the queued URLs, at most ``depth`` levels deep.

    Works on two module-level globals that must exist before the call:
    ``urlList`` (pending URLs, consumed from the end, so the most recently
    discovered link is visited first) and ``urlDict`` (URL -> level at which
    it was first recorded; also serves as the "already seen" set).

    Each visited URL is printed, indented by its level.  Pages that fail to
    download are reported on stderr and skipped so that one bad link does not
    abort the whole crawl.

    Args:
        depth: Deepest level to visit; the seed URL is level 1.
    """
    while urlList:
        url = urlList.pop()
        level = urlDict[url]
        if level > depth:
            continue

        print('\t\t\t\t' * level, '这是第%d层,%s' % (level, url))

        # Links found on a page at the depth limit would sit one level too
        # deep and never be visited, so do not spend a request on them.
        if level == depth:
            continue

        try:
            sonurlList = getUrl(url)
        except requests.RequestException as err:
            print('failed to fetch %s: %s' % (url, err), file=sys.stderr)
            continue

        for newurl in sonurlList:
            if newurl not in urlDict:
                urlDict[newurl] = level + 1
                urlList.append(newurl)


if __name__ == '__main__':
    startUrl = 'https://www.baidu.com/s?wd=世界杯'
    urlList = [startUrl]
    urlDict = {startUrl: 1}
    vastSpider(4)