Source code for icrawler.storage.google_storage

from icrawler.storage import BaseStorage
from io import BytesIO


[docs]class GoogleStorage(BaseStorage): """Google Storage backend. The id is filename and data is stored as text files or binary files. The root_dir is the bucket address such as gs://<your_bucket>/<your_directory>. """ def __init__(self, root_dir): try: from google.cloud import storage except ImportError: print('GoogleStorage backend requires the package ' '"google-cloud-storage", execute ' '"pip install google-cloud-storage" to install it.') self.client = storage.Client() bucket_str = root_dir[5:].split('/')[0] self.bucket = self.client.get_bucket(bucket_str) self.folder_str = root_dir[6 + len(bucket_str):] if self.folder_str[0] == '/': self.folder_str = self.folder_str[1:]
[docs] def write(self, id, data): blob = self.bucket.blob(self.folder_str + '/' + id) data_buffer = BytesIO(data) blob.upload_from_file(file_obj=data_buffer, size=len(data))
[docs] def exists(self, id): blob = self.bucket.blob(self.folder_str + '/' + id) return blob.exists()
[docs] def max_file_idx(self): return len(self.bucket.list_blobs(prefix=self.folder_str))