# Source code for smoothcrawler_cluster.crawler.attributes

"""*The basic attributes of crawler in SmoothCrawler-Cluster*

A crawler object in *SmoothCrawler-Cluster* must have some basic attributes, e.g., *name*, *id_separation*, etc.
Although these attributes are necessary for every crawler in a cluster, all of them are optional, so the crawler's
arguments are not required. That raises a question: what value should an attribute get when it is left empty?
Of course, we could set it manually to any value we want. But what if we want to set it more conveniently?
For example, if we run multiple crawler instances in Docker containers, do we need to set attributes such as the name
for every single container? This module exists to resolve that issue.

*New in version 0.2.0.*
"""

from abc import ABCMeta, abstractmethod
from typing import List, Union


class BaseCrawlerAttribute(metaclass=ABCMeta):
    """*The base class of all crawler's attribute*

    Definition of crawler's attribute objects. Currently, every crawler must have one property --- *name*. Another
    property, *id_separation*, could be set automatically with *name*.
    """

    _instance = None
    # Base of the generated default name and the default separator placed between that base and the identity.
    _default_base_name: str = "sc-crawler"
    _default_id_separation: str = "_"
    # Backing field of the *has_default* property.
    _has_default: bool = True

    @property
    @abstractmethod
    def name(self) -> str:
        """:obj:`str`: Properties with both a getter and setter. This crawler instance name. It MUST be uniquely
        named in the cluster (the same group) so the entire crawler cluster can distinguish every instance; for
        example, the properties *current_crawler*, *current_runner* and *current_backup* in meta-data **GroupState**
        are recorded by crawler names. This option value could be modified by Zookeeper object option *name*.
        """
        pass

    @name.setter
    @abstractmethod
    def name(self, name: str) -> None:
        pass

    @property
    @abstractmethod
    def id_separation(self) -> str:
        """:obj:`str`: Properties with both getter and setter. The string used to separate the attribute *name*
        value in order to get the identity of each crawler instance.
        """
        pass

    @id_separation.setter
    @abstractmethod
    def id_separation(self, sep: Union[str, List[str]]) -> None:
        pass

    @property
    @abstractmethod
    def current_id(self) -> str:
        """:obj:`str`: Properties with only getter. The current identity of each crawler instance. It MUST BE
        unique.
        """
        pass

    @property
    def has_default(self) -> bool:
        """:obj:`bool`: Properties with both getter and setter. Whether the properties *name* and *id_separation*
        can have default value or not.
        """
        return self._has_default

    @has_default.setter
    def has_default(self, has_default: bool) -> None:
        self._has_default = has_default

    def init(self, name: str, id_separation: Union[str, List[str]]) -> None:
        """Initialize the object.

        The property setters are meant to be used in a specific order, so this function sets the properties in
        that order directly: *name* first, then *id_separation*.

        Args:
            name (str): The crawler instance's name. This value would be set to property *name*.
            id_separation (Union[str, List[str]]): The separation used to split the crawler instance's name to get
                its identity. This value would be set to property *id_separation*.

        Returns:
            None.

        """
        self.name = name
        self.id_separation = id_separation


class NextableAttribute(BaseCrawlerAttribute):
    """*The one type of base crawler attribute with expected crawler's identity*

    This crawler attribute base class means the crawler instance's identity is expected. In other words, it can use
    some specific way or logic to get the next identity for a new crawler instance if one needs to be generated in
    the cluster.
    """

    @property
    @abstractmethod
    def next_id(self) -> str:
        """:obj:`str`: Properties with only getter. The next identity of crawler instance. This identity MUST be new
        and unique, i.e., not used before. This property only lets you know what the next one is; it won't really
        iterate to it.
        """
        pass

    @property
    @abstractmethod
    def iter_to_next_id(self) -> str:
        """:obj:`str`: Properties with only getter. The next identity of crawler instance. This identity MUST be new
        and unique, i.e., not used before. This property would operate to the next one, which means that if you try
        to get the value by property *name*, it would turn to be the value which is equal to the return value of
        this property.
        """
        pass


class SerialCrawlerAttribute(NextableAttribute):
    """*The attribute let crawler's identity to be serial*

    This crawler attribute generates crawler identity as serial number like 1, 2, 3, ..., etc. This is the default
    attribute of crawler when it runs in local site directly.
    """

    _name: str = None
    _id_separation: str = None
    # Serial counter backing *current_id* / *next_id*; starts at 1.
    _id_cnt: int = 1

    @property
    def name(self) -> str:
        # Build the default name lazily on first read: the setter treats an empty value as "use the default".
        if self.has_default and not self._name:
            self.name = ""
        return self._name

    @name.setter
    def name(self, name: str) -> None:
        if name:
            self._name = str(name)
            return

        # Empty name: compose "<default base name><id separation><current id>". Without a separation, and without
        # permission to fall back to the default one, no name can be generated.
        if not self.id_separation and not self.has_default:
            raise ValueError(
                "The property *id_separation* value should NOT be empty when setting the *name* "
                "property if it cannot have default value."
            )
        self._name = f"{self._default_base_name}{self.id_separation}{self.current_id}"

    @property
    def id_separation(self) -> str:
        # Fall back to (and remember) the default separation when none has been set and defaults are allowed.
        if self.has_default and not self._id_separation:
            self._id_separation = self._default_id_separation
        return self._id_separation

    @id_separation.setter
    def id_separation(self, sep: Union[str, List[str]]) -> None:
        """Set the separation, validating it against the crawler name when a name is already set.

        Args:
            sep (Union[str, List[str]]): One separation, or a list of candidates of which the first one that can
                parse an integer identity from the tail of the crawler name is kept.

        Raises:
            ValueError: *sep* is empty while default values are not allowed, or no given separation can parse an
                integer identity from the crawler name.
            TypeError: *sep* is neither a ``str`` nor a ``list``.

        """
        type_error_msg = "*id_separation* property setter only accept 'str' or 'list[str]' type value."

        def _chk_separation(ut_sep) -> bool:
            # A separation is usable when the last piece of the split name is an integer. ``str.split`` raises
            # ValueError for an empty separator, so it stays inside the ``try``: a bad candidate is reported as
            # "not matching" instead of aborting the check of the remaining candidates.
            try:
                int(self.name.split(sep=ut_sep)[-1])
            except ValueError:
                return False
            return True

        if not sep:
            if not self.has_default:
                raise ValueError("Argument cannot be empty when it sets *id_separation* property.")
            sep = self._default_id_separation

        if not self._name:
            # No name to validate against yet, so only the type can be checked.
            if not isinstance(sep, (str, list)):
                raise TypeError(type_error_msg)
            # NOTE(review): a list is stored unresolved here, so *id_separation* may return a list afterwards and
            # a later generated default name would embed it -- confirm whether this is intended.
            self._id_separation = sep
            return

        if isinstance(sep, str):
            candidates = [sep]
        elif isinstance(sep, list):
            candidates = sep
        else:
            raise TypeError(type_error_msg)

        for one_sep in candidates:
            if _chk_separation(one_sep):
                self._id_separation = one_sep
                return

        raise ValueError(f"This separation(s) '{sep}' cannot parse anything from the crawler name '{self.name}'.")

    @property
    def current_id(self) -> str:
        return str(self._id_cnt)

    @property
    def next_id(self) -> str:
        # Peek only: the counter is not advanced.
        return str(self._id_cnt + 1)

    @property
    def iter_to_next_id(self) -> str:
        # Advance the counter by one and return the new identity.
        self._id_cnt = int(self.next_id)
        return str(self._id_cnt)