# Beginner Python crawler exercise: uses urllib.request to scrape project data from zhongchou.com (a crowdfunding site).


#python 3.6
import re
import urllib
import urllib.request

import mysql.connector

# Open the connection to the local MySQL `test` database that every
# insert below writes into; one shared cursor is reused throughout.
conn = mysql.connector.connect(
    host='localhost',
    user='root',
    password='root',
    database='test',
)
cursor = conn.cursor()

def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The response object is closed deterministically via the context
    manager (the original leaked it by never calling close()).
    """
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode('utf-8')

def getData(html, leibie):
    """Parse one listing page and insert every project card into MySQL.

    html   -- decoded page HTML from getHtml()
    leibie -- category label stored alongside each row

    NOTE(review): the regex literals below look corrupted in the source
    (the HTML tag text appears to have been stripped by a paste/export);
    verify each pattern against the live page markup before trusting
    the extraction.
    """
    # One match per project card; re.S lets '.' span embedded newlines.
    reg = re.compile('\n(.*?)\n\n筹款进度\n\n', re.S)
    xiangmu = re.findall(reg, html)
    shengfen = ['河北', '山西', '辽宁', '吉林', '黑龙江', '江苏', '浙江', '安徽', '福建',
                '江西', '山东', '河南', '湖北', '湖南', '广东', '海南', '四川', '贵州', '云南',
                '陕西', '甘肃', '青海', '台湾', '内蒙古', '广西', '西藏', '宁夏', '新疆', '香港', '澳门']
    zhixiashi = ['北京', '天津', '上海', '重庆']
    for card in xiangmu:
        name = re.findall('class="siteCardICH3" title="(.*.)" target="_blank"', card)
        # Expected in document order: amount raised, backer count, progress.
        yichouzhichijindu = re.findall('\n\n(.*.)\n\n', card)
        label = re.findall('site_ALink siteIlB_item" target="_blank">(.*)', card)
        # Scan link labels for the first province (followed by its city) or a
        # directly-administered municipality (which has no separate city).
        # NOTE(review): raises IndexError when neither is found — this is the
        # original behaviour, kept as-is.
        index = 0
        while True:
            if label[index] in shengfen:
                province = label[index]
                city = label[index + 1]
                break
            if label[index] in zhixiashi:
                province = label[index]
                city = ''
                break
            index += 1
        # Parameterised query: the driver quotes values itself, which closes
        # the string-formatting SQL-injection hole and makes the old
        # "replace apostrophes with “" workaround unnecessary.
        cursor.execute(
            "insert into test(项目名称,已筹款,支持数,筹款进度,省份or直辖市,市,类别)"
            " values(%s,%s,%s,%s,%s,%s,%s)",
            (name[0], yichouzhichijindu[0][1:-1], yichouzhichijindu[1],
             yichouzhichijindu[2], province, city, leibie))
        conn.commit()


def endPage(html):
    """Return the last page number listed in the pager of *html*."""
    temp = re.findall('normalPage">(.*)', html)
    return int(temp[-1])


def main():
    """Crawl five categories page-by-page and store every project row."""
    print('begin')
    # Category id suffix -> human-readable label (hoisted out of the loop).
    types = {1: '生物科技', 2: '果蔬种植', 3: '生态养殖', 4: '茶酒饮品', 5: '休闲零食'}
    for i in [1, 2, 3, 4, 5]:
        urleibie = 'http://www.zhongchou.com/browse/id-28-tid-4' + str(i) + '-sm-p'
        leibie = types[i]
        ii = 1
        while True:
            url = urleibie + str(ii)
            html = getHtml(url)
            getData(html, leibie)
            endpage = endPage(html)
            print('page %s has finished' % ii)
            ii += 1
            if ii > endpage:
                break
        print('type "%s" has finished' % leibie)
    cursor.close()
    conn.close()
    print('all finished')


# Guarded entry point: the crawl no longer fires as a side effect of import.
if __name__ == '__main__':
    main()

# Written for a university innovation project on funding completion rates; taught myself scraping for it. This version stores results in a MySQL database.