import scrapy
from tiantianmeiju.items import TiantianmeijuItem
import sys
reload(sys)  # from Python 2.5 on, sys.setdefaultencoding is removed after startup; reloading sys restores it
sys.setdefaultencoding('utf-8')

class CacthUrlSpider(scrapy.Spider):
    name = 'meiju'
    allowed_domains = ['cn163.net']
    start_urls = ["http://cn163.net/archives/{id}/".format(id=id) for id in ['16355', '13470', '18766', '18805']]

    def parse(self, response):
        item = TiantianmeijuItem()
        item['name'] = response.xpath('//*[@id="content"]/div[2]/div[2]/h2/text()').extract()  # show title
        item['image_urls'] = response.xpath('//*[@id="entry"]/div[2]/img/@src').extract()      # poster image
        item['episode'] = response.xpath('//*[@id="entry"]/p[last()]/a/text()').extract()      # episode titles
        item['episode_url'] = response.xpath('//*[@id="entry"]/p[last()]/a/@href').extract()   # download links
        yield item

The page itself is fairly simple.
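The spider fills a TiantianmeijuItem; the post does not show items.py, but a minimal sketch with the fields used above would look like this (the field names come from the spider and pipeline code, the class body itself is a reconstruction):

# items.py: minimal reconstruction, not shown in the original post
import scrapy

class TiantianmeijuItem(scrapy.Item):
    name = scrapy.Field()         # show title
    image_urls = scrapy.Field()   # poster URLs, consumed by the images pipeline
    image_paths = scrapy.Field()  # set in MyImagesPipeline.item_completed
    episode = scrapy.Field()      # episode titles
    episode_url = scrapy.Field()  # episode download links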
Pipelines: two pipelines are defined here, one saves the download links to a file and the other downloads the images.

import json
import os
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
from settings import IMAGES_STORE  # Python 2 implicit relative import: settings.py sits in the same package

class TiantianmeijuPipeline(object):
    # default project pipeline, left as a no-op; the two pipelines below do the actual work
    def process_item(self, item, spider):
        return item

class WriteToFilePipeline(object):
    def process_item(self, item, spider):
        item = dict(item)
        FolderName = item['name'][0].replace('/', '')  # strip '/' so the title is a safe directory name
        downloadFile = 'download_urls.txt'
        folder = os.path.join(IMAGES_STORE, FolderName)
        if not os.path.isdir(folder):  # the show's folder may not exist yet; create it before writing
            os.makedirs(folder)
        with open(os.path.join(folder, downloadFile), 'w') as f:
            for name, url in zip(item['episode'], item['episode_url']):
                f.write('{name}: {url}\n'.format(name=name, url=url))
        return item

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # schedule one download request per poster URL
        for image_url in item['image_urls']:
            yield Request(image_url, meta={'item': item})

    def item_completed(self, results, item, info):
        # keep only the paths of the images that downloaded successfully
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item  # must return the item so later pipelines still receive it
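
Neither pipeline runs until it is registered in settings.py. A minimal sketch of the relevant entries (the priority numbers and the IMAGES_STORE path are assumptions, not taken from the original project):

# settings.py: only the pipeline-related entries; the values below are assumptions
IMAGES_STORE = './images'  # base directory shared by both pipelines

ITEM_PIPELINES = {
    'tiantianmeiju.pipelines.MyImagesPipeline': 1,     # download the posters first
    'tiantianmeiju.pipelines.WriteToFilePipeline': 2,  # then write download_urls.txt per show
}

With these in place, the crawl is started with scrapy crawl meiju.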