python — Scrapy CrawlSpider doesn't scrape the first landing page
I am new to Scrapy and working on a scraping exercise using CrawlSpider. Although the Scrapy framework works beautifully and follows the relevant links, I can't seem to make the CrawlSpider scrape the very first link (the home page / landing page). Instead it goes directly to scraping the links determined by the rule, and never scrapes the landing page that those links are on. I don't know how to fix this, since it is not recommended to override the `parse` method of a CrawlSpider. Modifying `follow=True/False` doesn't change the result either. Here is a snippet of the code:
class DownloadSpider(CrawlSpider):
    """Crawl bnt-chemicals.de and dump each matched page to a numbered .txt file.

    NOTE(review): this is the question's snippet; it never scrapes the
    start URL because CrawlSpider routes start responses through
    parse_start_url, which is not overridden here.
    """
    name = 'downloader'
    allowed_domains = ['bnt-chemicals.de']
    start_urls = ["http://www.bnt-chemicals.de"]
    rules = (
        # BUG fixed: the keyword was misspelled 'aloow' in the original,
        # so the link extractor ignored it; it must be 'allow'.
        Rule(SgmlLinkExtractor(allow='prod'), callback='parse_item', follow=True),
    )
    fname = 1  # counter used to name the per-page output files

    def parse_item(self, response):
        """Append URL, crawl depth and body of *response* to '<fname>.txt'.

        The original opened the same file five separate times without ever
        closing it (leaked handles); one `with` block does the same writes.
        """
        with open('%s.txt' % self.fname, 'a') as f:
            f.write(response.url)
            f.write(',%s' % response.meta['depth'])
            f.write('\n')
            f.write(response.body)
            f.write('\n')
        self.fname = self.fname + 1
Just change your callback to `parse_start_url` and override it:
from scrapy.contrib.spiders import CrawlSpider, Rule
# FIX: the original snippet lost the 'from' keyword on this import line.
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class DownloadSpider(CrawlSpider):
    """Answer's version: route both the start URL and the rule-matched
    links through parse_start_url, so the landing page is scraped too."""
    name = 'downloader'
    allowed_domains = ['bnt-chemicals.de']
    start_urls = ["http://www.bnt-chemicals.de"]
    rules = (
        # Using 'parse_start_url' as the callback means the same handler
        # processes the start response (called by CrawlSpider automatically)
        # and every link the extractor matches.
        Rule(SgmlLinkExtractor(allow='prod'), callback='parse_start_url', follow=True),
    )
    fname = 0  # incremented before each write, so files start at 1.txt

    def parse_start_url(self, response):
        """Write URL, depth and body of *response* to a fresh numbered file.

        Raises: nothing beyond normal I/O errors; meta.get('depth', 0)
        covers the start response, which carries no depth key.
        """
        self.fname += 1
        fname = '%s.txt' % self.fname
        # FIX: the original was 'open(fname, 'w') f:' — the scrape dropped
        # the 'with ... as' keywords, leaving a syntax error.
        with open(fname, 'w') as f:
            f.write('%s, %s\n' % (response.url, response.meta.get('depth', 0)))
            f.write('%s\n' % response.body)
Comments
Post a Comment