Source code for icrawler.feeder

# -*- coding: utf-8 -*-

import os.path as osp
from threading import current_thread

from icrawler.utils import ThreadPool


class Feeder(ThreadPool):
    """Base class for feeders.

    A thread pool of feeder threads, in charge of feeding urls to parsers.

    Attributes:
        thread_num (int): An integer indicating the number of threads.
        signal (Signal): A :class:`Signal` object for communication
            among all threads.
        out_queue (Queue): A queue connected with parsers' inputs,
            storing page urls.
        session (Session): A session object.
        logger (Logger): A logging.Logger object used for logging.
        workers (list): A list storing all the threading.Thread objects
            of the feeder.
        lock (Lock): A :class:`Lock` instance shared by all feeder threads.
    """

    def __init__(self, thread_num, signal, session):
        """Init Feeder with some shared variables."""
        super(Feeder, self).__init__(
            thread_num=thread_num, in_queue=None, name='feeder')
        self.signal = signal
        self.session = session
    def feed(self, **kwargs):
        """Feed urls.

        This method should be implemented by users.
        """
        raise NotImplementedError
    def worker_exec(self, **kwargs):
        """Target function of workers."""
        self.feed(**kwargs)
        self.logger.info('thread {} exit'.format(current_thread().name))
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.logger.info('all feeder threads exited')
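
# Example: since ``feed`` is meant to be implemented by users, a custom
# feeder only needs to override it and put urls into ``out_queue``. This is
# a minimal sketch; ``RangePageFeeder``, ``base_url`` and ``page_count`` are
# illustrative names, not part of icrawler.
class RangePageFeeder(Feeder):
    """A hypothetical feeder that enumerates numbered listing pages."""

    def feed(self, base_url, page_count):
        # Generate one listing url per page and hand it to the parsers.
        for page in range(1, page_count + 1):
            url = '{}?page={}'.format(base_url, page)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))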
class UrlListFeeder(Feeder):
    """Url list feeder which feeds a list of urls."""
    def feed(self, url_list, offset=0, max_num=0):
        if isinstance(url_list, str):
            if osp.isfile(url_list):
                with open(url_list, 'r') as fin:
                    url_list = [line.rstrip('\n') for line in fin]
            else:
                raise IOError('url list file {} not found'.format(url_list))
        elif not isinstance(url_list, list):
            raise TypeError('"url_list" can only be a filename or a str list')

        if offset < 0 or offset >= len(url_list):
            raise ValueError('"offset" exceeds the list length')
        if max_num > 0:
            end_idx = min(len(url_list), offset + max_num)
        else:
            end_idx = len(url_list)
        for i in range(offset, end_idx):
            url = url_list[i]
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))
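
# Rough usage sketch for ``UrlListFeeder``. A ``Crawler`` normally creates
# the feeder and wires ``out_queue`` to the parser via ``connect()``; here
# ``feed`` is called directly with a plain queue instead. Assumptions:
# ``Signal`` from ``icrawler.utils`` can be constructed with no arguments,
# and any requests-compatible session works since ``feed`` never uses it.
#
#     import queue
#     import requests
#     from icrawler.utils import Signal
#
#     feeder = UrlListFeeder(
#         thread_num=1, signal=Signal(), session=requests.Session())
#     feeder.out_queue = queue.Queue()  # normally set by Crawler via connect()
#     feeder.feed(url_list=['https://example.com/a', 'https://example.com/b'])
#     assert feeder.out_queue.qsize() == 2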
class SimpleSEFeeder(Feeder):
    """Simple search-engine-like feeder."""
    def feed(self, url_template, keyword, offset, max_num, page_step):
        """Feed urls once.

        Args:
            url_template: A string with parameter slots written as "{}".
            keyword: A string indicating the search keyword.
            offset: An integer indicating the starting index.
            max_num: An integer indicating the max number of images to be
                crawled.
            page_step: An integer added to offset after each iteration.
        """
        for i in range(offset, offset + max_num, page_step):
            url = url_template.format(keyword, i)
            self.out_queue.put(url)
            self.logger.debug('put url to url_queue: {}'.format(url))
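
# For example, with a hypothetical search url template, the loop above
# expands to one page url per ``page_step``:
#
#     url_template = 'https://searchengine.example/search?q={}&start={}'
#     for i in range(0, 60, 20):
#         print(url_template.format('cat', i))
#     # https://searchengine.example/search?q=cat&start=0
#     # https://searchengine.example/search?q=cat&start=20
#     # https://searchengine.example/search?q=cat&start=40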