class FinallPageParser(HTMLParser):
    """HTML parser for a JD.com product detail page.

    Feed it the page html and then call getdata() to obtain the product
    title, JD price, reference (crossed-out) price, preview and detail
    image urls, key/value product details, specification-table cells and
    the size/type options.
    """

    def __init__(self):
        # Tags this parser reacts to.  (The original listed 'div' twice;
        # membership tests are unaffected, so the duplicate is dropped.)
        self.handledtags = ['div', 'h1', 'strong', 'a', 'del', 'img',
                            'li', 'span', 'tbody', 'tr', 'th', 'td', 'i']
        self.processing = None      # tag currently being processed, or None
        self.title = ''             # product title
        self.jdprice = ''           # JD sale price
        self.refprice = ''          # reference (crossed-out) price
        self.partimgs_show = set()  # preview ("spec-items") image urls
        self.partimgs = set()       # detail image urls
        self.partdetail = {}        # product details: key/value pairs
        self.specification = []     # specification-table cell texts
        self.typeOrsize = set()     # size / type options
        self.div = ''               # name of the most recent div attribute
        # Per-field match flags, re-armed on every start tag.  (The
        # original initialized 'typeOrsize' twice; once is enough.)
        self.flag = dict.fromkeys(
            ('refprice', 'title', 'jdprice', 'typeOrsize',
             'partimgs', 'partdetail', 'specification'), '')
        self.link = ''
        self.partslinks = {}
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Classify *tag* and arm the matching flag(s) for handle_data."""
        self.titleflag = ''
        for key in self.flag:       # clear every flag before re-arming
            self.flag[key] = ''
        if tag not in self.handledtags:
            return
        self.data = ''
        self.processing = tag
        if tag == 'div':
            # Remember the div's attribute value so later <img> tags can
            # tell whether they sit inside the "spec-items" preview strip.
            for _key, value in attrs:
                self.div = value
        elif tag == 'i':
            self.flag['typeOrsize'] = 'match'
        elif tag == 'a' and len(attrs) == 2:
            self._handle_anchor(attrs)
        elif tag == 'h1':
            self.flag['title'] = 'match'
        elif tag == 'strong' and len(attrs) == 2:
            # avoid shadowing the builtin `id` (original used `id`)
            for _cls, attr_id in attrs:
                if attr_id == 'jd-price':
                    self.flag['jdprice'] = 'match'
        elif tag == 'del':
            self.flag['refprice'] = 'match'
        elif tag == 'li':
            self.flag['partdetail'] = 'match'
        elif tag in ('th', 'tr', 'td'):
            # NOTE(review): a <td> containing <br/> yields several data
            # chunks; e.g. page 879498.html -- confirm that appending each
            # chunk separately is acceptable.
            self.flag['specification'] = 'match'
        elif tag == 'img':
            self._handle_img(attrs)

    def _handle_anchor(self, attrs):
        """Arm typeOrsize for <a href="http://item.jd.com/<id>.html" title="...">.

        Fixed: the original built a marker string and overwrote it when the
        title attribute preceded the href, so attribute order mattered.
        """
        has_item_href = False
        has_title = False
        for key, value in attrs:
            if key == 'href' and re.search(r'^http:\/\/item.jd.com\/[0-9]{1,10}.html$', value):
                has_item_href = True
            if key == 'title' and value != "":
                has_title = True
        if has_item_href and has_title:
            self.flag['typeOrsize'] = 'match'

    def _handle_img(self, attrs):
        """Collect image urls; images at most 160px wide are skipped (logos).

        Fixed regex: the original pattern
        '^http://img.*jpg|^http://img.gif|^http://img.png' had literal
        second/third alternatives that could never match a real url.
        """
        imgtmp_flag = ''
        imgtmp = ''
        for key, value in attrs:
            if re.search(r'^http://img.*\.(?:jpg|gif|png)', str(value)) \
                    and (key == 'src' or key == 'data-lazyload'):
                imgtmp = value
            if key == 'width':
                if re.search(r'^\d{1,9}$', value):
                    if int(value) <= 160:
                        imgtmp_flag = 'no'  # too narrow -- probably a logo
                        break
        if self.div == "spec-items" and imgtmp != '':
            # preview strip: swap the thumbnail size path /n5/ for full /n1/
            imgtmp = re.compile("/n5/").sub("/n1/", imgtmp)
            self.partimgs_show.add(imgtmp)
        elif imgtmp_flag != 'no' and imgtmp != '':
            self.partimgs.add(imgtmp)

    def handle_data(self, data):
        """Route character data to whichever field's flag is armed."""
        if not self.processing:
            return
        self.data += data
        stripped = data.strip()
        if self.flag['title'] == 'match':
            self.title = data
        if self.flag['jdprice'] == 'match':
            self.jdprice = stripped
        if self.flag['typeOrsize'] == 'match':
            self.typeOrsize.add(stripped)
        if self.flag['refprice'] == 'match':
            self.refprice = stripped
        if self.flag['partdetail'] == 'match' and re.search(r':', data):
            # Split only on the FIRST colon; the original split on every
            # colon and silently dropped anything after the second one.
            keytmp, _, valuetmp = data.partition(":")
            self.partdetail[keytmp.strip()] = valuetmp.strip()
        if self.flag['specification'] == 'match' and stripped != '' and stripped != '主体':
            self.specification.append(stripped)

    def handle_endtag(self, tag):
        """Stop accumulating when the tag being processed closes."""
        if tag == self.processing:
            self.processing = None

    def getdata(self):
        """Return every extracted field as a single dict."""
        return {'title': self.title, 'partimgs_show': self.partimgs_show,
                'jdprice': self.jdprice, 'refprice': self.refprice,
                'partimgs': self.partimgs, 'partdetail': self.partdetail,
                'specification': self.specification,
                'typeOrsize': self.typeOrsize}

# Next in the file: httpread, a helper that issues an HTTP GET request and
# returns the fetched body.
def judgelist(listpageurl, finallylistpageurl):
    """Return True when a list category has been fully crawled.

    A category is considered done when every item url on both its LAST
    list page and its FIRST list page has already been downloaded
    (checked via judgeurl).  The last page is checked first, exactly as
    the original flag-and-break loops did, and both checks short-circuit
    on the first missing url.
    """
    # urls on the final list page of the category
    finalparseurls_deep_finally = list(parseListpageurl(finallylistpageurl)['finalparseurls'])
    # urls on the first list page of the category
    finalparseurls_deep_first = list(parseListpageurl(listpageurl)['finalparseurls'])
    return (all(judgeurl(u) for u in finalparseurls_deep_finally)
            and all(judgeurl(u) for u in finalparseurls_deep_first))

# Next in the file: run(), the method that drives the whole crawl.
def run():
    """Crawl every configured JD list category.

    For each list url: parse the list to learn its page count, skip the
    whole category if judgelist says it is already complete, otherwise
    walk page 1..N, download every item page not yet recorded in
    judgeurl.txt, append its content to data.txt and record the url both
    in judgeurl.txt and in the in-memory sorted list judgeurl_all_lines.
    """
    partlists = {'http://list.jd.com/list.html?cat=737,794,870': '空调'}
    for listpageurl in partlists.keys():
        # parse the list page, e.g. http://list.jd.com/list.html?cat=737,794,870
        parseListpageurl_rult = parseListpageurl(listpageurl)
        totalPageNo = parseListpageurl_rult['totalPageNo']
        # build the url of the category's last list page (pages 1..n)
        finallylistpageurl = listpageurl + '&page=' + str(int(totalPageNo) + 1) + '&JL=6_0_0'
        if judgelist(listpageurl, finallylistpageurl):
            # category already fully crawled -- skip it
            print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                  + ',All html done for ' + str(listpageurl) + ":"
                  + str(partlists[listpageurl]) + "【Done Done】,【^_^】")
            continue
        else:
            # otherwise walk the list from page 1 downwards
            for i in range(1, int(totalPageNo) + 2):
                listpageurl_next = listpageurl + '&page=' + str(i) + '&JL=6_0_0'
                parseListpageurl_rult = parseListpageurl(listpageurl_next)
                # refresh the page count so stale data cannot cut us short
                totalPageNo = parseListpageurl_rult['totalPageNo']
                finalparseurls_deep = list(parseListpageurl_rult['finalparseurls'])
                for finalparseurl in finalparseurls_deep:
                    if judgeurl(finalparseurl):
                        # this item page was crawled on an earlier run
                        print('finalparseurl pass yet:' + finalparseurl)
                    else:
                        finalurl_content = getfinalurl_content(partlists, listpageurl, finalparseurl)
                        # append the fetched page content to data.txt
                        with open("data.txt", "a") as datafile:
                            datafile.writelines(finalurl_content + "\n")
                        # record the url as crawled, on disk and in memory
                        with open("judgeurl.txt", "a") as judgefile:
                            judgefile.writelines(finalparseurl + "\n")
                        # keep the in-memory list sorted for binary search
                        bisect.insort_right(judgeurl_all_lines, finalparseurl + "\n")

# Next in the file: the main entry point.
if __name__ == '__main__':
    # Python 2 only: make utf8 the default codec so Chinese text can be
    # concatenated with byte strings without UnicodeDecodeError.
    reload(sys)
    sys.setdefaultencoding('utf8')
    socket.setdefaulttimeout(5)  # global timeout for all network reads

    # judgeurl.txt records every url already crawled; if the script is
    # interrupted it can simply be restarted and will resume.
    if not os.path.exists("judgeurl.txt"):
        open("judgeurl.txt", 'w').close()

    # Read once at startup; run() keeps both the file and this in-memory
    # list up to date.  Sorted because judgeurl's binary search (bisect)
    # requires sorted input.
    with open("judgeurl.txt", "r") as judgefile:
        judgeurl_all_lines = judgefile.readlines()
    judgeurl_all_lines.sort()

    # BUG FIX: the original wrote Thread(target=run(), ...), which CALLED
    # run() synchronously in the main thread and handed Thread its None
    # return value.  Pass the callable itself so each thread runs run().
    # NOTE(review): both threads share judgeurl_all_lines and append to
    # the same files without any lock -- confirm this race is acceptable.
    Thread(target=run, args=()).start()
    Thread(target=run, args=()).start()
    #Thread(target=run, args=()).start()