Source code for deepchem_server.core.cards

import ast
import datetime
import json
from typing import Dict, Optional

from deepchem_server.core import model_mappings


class Card:
    """Common base for data cards and model cards.

    Records a human-readable timestamp at construction time and supplies
    JSON / bytes serialization together with a generic attribute updater.
    """

    def __init__(self) -> None:
        """Stamp the card with the current local time."""
        now = datetime.datetime.now()
        self.last_updated_time = now.strftime('%d-%B-%Y %H:%M:%S')

    def __bytes__(self) -> bytes:
        """Serialize the card to UTF-8 encoded bytes.

        Returns
        -------
        bytes
            The JSON form of the card (see ``to_json``) encoded as UTF-8.
        """
        serialized = self.to_json()
        return bytes(serialized, encoding='utf8')

    def to_json(self) -> str:
        """Serialize the card to a JSON string.

        Returns
        -------
        str
            JSON document built from the instance attributes.
        """

        def attributes_of(obj):
            # Fallback for anything json cannot encode natively (starting
            # with the card itself): serialize its attribute dictionary.
            return obj.__dict__

        return json.dumps(self, default=attributes_of)

    def update_card(self, key: str, value) -> None:
        """Set (or overwrite) one attribute of the card.

        Parameters
        ----------
        key : str
            Name of the attribute to set.
        value : Any
            Value to store under ``key``.

        Returns
        -------
        None
        """
        # setattr (rather than writing into __dict__) so that properties
        # defined by subclasses are honoured.
        setattr(self, key, value)
class DataCard(Card):
    """Class for storing data card attributes.

    Parameters
    ----------
    address : str
        Address of the reference object in the datastore.
    file_type : str
        The file extension - ex. csv filetype, .json file type etc.
        Must be one of ``SUPPORTED_FILE_TYPES``.
    data_type : str
        The type of object stored at the location pointed by filename -
        ex: pd.DataFrame, dask.dataframe.DataFrame. Must be one of
        ``SUPPORTED_DATA_TYPES`` or one of the short aliases accepted by
        ``validate_datatype``.
    shape : tuple, optional
        Shape of the data object.
    description : str, optional
        A description about the datastore.
    featurizer : str, optional
        The featurizer used in the dataset.
    intended_use : str, optional
        Notes on dataset - the intended use of the dataset.
    caveats : str, optional
        Notes on dataset - the caveats in using the dataset.
    feat_kwargs : dict, optional
        Keyword arguments for featurizer (used when featurizer is not None).
    **kwargs
        Additional attributes to set on the card.

    Raises
    ------
    TypeError
        If ``address``, ``file_type`` or ``data_type`` is not a string.
    AssertionError
        If ``file_type`` or ``data_type`` is not supported.

    Notes
    -----
    Difference between data_type and file_type:
    An example can illustrate this better. A csv file (file_type) can either
    be a pandas.DataFrame or dask.dataframe.DataFrame or just a csv file.
    The file_type holds the file extension ('csv') while data_type refers to
    the data object (pandas.DataFrame, dask.dataframe.DataFrame, etc).
    """

    SUPPORTED_DATA_TYPES = [
        'pandas.DataFrame',
        'dc.data.NumpyDataset',
        'dc.data.DiskDataset',
        'json',
        'text/plain',
        'png',
        'binary',
    ]
    SUPPORTED_FILE_TYPES = [
        'csv', 'dir', 'json', 'pdb', 'fasta', 'fastq', 'png', 'sdf', 'dcd',
        'txt', 'xml', 'py', 'pdbqt', 'zip', 'smi', 'smiles', 'bz2',
        'cxsmiles', 'onnx', 'hdf5', 'log'
    ]

    def __init__(self,
                 address: str,
                 file_type: str,
                 data_type: str,
                 shape=None,
                 description: Optional[str] = None,
                 featurizer: Optional[str] = None,
                 intended_use: Optional[str] = None,
                 caveats: Optional[str] = None,
                 feat_kwargs: Optional[Dict] = None,
                 **kwargs) -> None:
        """Initialize a DataCard."""
        super().__init__()
        # Type checks run before any membership validation so that a
        # non-string argument is reported as a TypeError.
        if not isinstance(address, str):
            raise TypeError("address must be a string")
        if not isinstance(file_type, str):
            raise TypeError("file_type must be a string")
        if not isinstance(data_type, str):
            raise TypeError("data_type must be a string")
        data_type = self.validate_datatype(data_type)
        # Raised explicitly (instead of using `assert`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if file_type not in self.SUPPORTED_FILE_TYPES:
            raise AssertionError(
                'Filetype {} is not supported. Supported file types are {}'.
                format(file_type, self.SUPPORTED_FILE_TYPES))
        self.shape = shape
        self.address = address
        self.file_type = file_type
        self.data_type = data_type
        self.description = description
        self.featurizer = featurizer
        self.intended_use = intended_use
        self.caveats = caveats
        self.feat_kwargs = feat_kwargs
        # FIXME we should not depend on kwargs as internal parameters. These are features
        # for users to store additional details.
        # Note: cards rebuilt through from_json pass every serialized
        # attribute (e.g. last_updated_time) through here.
        for key, value in kwargs.items():
            setattr(self, key, value)

    def validate_datatype(self, data_type: str) -> str:
        """Validate and normalize data type name.

        Parameters
        ----------
        data_type : str
            The data type to validate. The short names 'DataFrame',
            'DiskDataset' and 'NumpyDataset' are expanded to their fully
            qualified form.

        Returns
        -------
        str
            The validated and normalized data type.

        Raises
        ------
        AssertionError
            If the data type is not supported.
        """
        # expand data type name
        if data_type == 'DataFrame':
            data_type = 'pandas.DataFrame'
        elif data_type == 'DiskDataset':
            data_type = 'dc.data.DiskDataset'
        elif data_type == 'NumpyDataset':
            data_type = 'dc.data.NumpyDataset'
        # Explicit raise so the validation survives `python -O`.
        if data_type not in self.SUPPORTED_DATA_TYPES:
            raise AssertionError(
                'Datatype {} is not supported. Supported data types are {}'.
                format(data_type, self.SUPPORTED_DATA_TYPES))
        return data_type

    @classmethod
    def from_json(cls, json_data: str) -> "DataCard":
        """Create DataCard from JSON string.

        Parameters
        ----------
        json_data : str
            JSON string representation of the DataCard.

        Returns
        -------
        DataCard
            DataCard instance created from the JSON data.
        """
        # Note: this may fail if the card contains nested objects, since
        # they come back as plain dicts rather than their original types.
        args = json.loads(json_data)
        return cls(**args)

    @classmethod
    def from_bytes(cls, card_bytes: bytes) -> "DataCard":
        """Create DataCard from bytes.

        Parameters
        ----------
        card_bytes : bytes
            UTF-8 encoded JSON representation of the DataCard.

        Returns
        -------
        DataCard
            DataCard instance created from the bytes data.
        """
        # Dispatch through cls so subclasses get an instance of their own type.
        return cls.from_json(card_bytes.decode('utf8'))

    def get_n_samples(self) -> int:
        """Get the number of samples in the dataset.

        Returns
        -------
        int
            Number of samples in the dataset.

        Raises
        ------
        ValueError
            If the dataset does not have shape information.
        """
        shape = self.shape
        if shape is None:
            raise ValueError("the dataset does not have shape")
        if self.data_type == 'pandas.DataFrame':
            return shape[0]  # shape of dataframe is (n_rows x n_cols)
        if self.data_type in ['dc.data.DiskDataset', 'dc.data.NumpyDataset']:
            # A deepchem dataset has shape (X_shape, y_shape, w_shape, ids_shape)
            x_shape = shape[0]
            return x_shape[0]
        # NOTE(review): for every other data type the stored shape is
        # returned as-is, which is only an int if the caller stored one.
        return shape

    def to_json(self) -> str:
        """Convert DataCard to JSON string.

        The shape is stored internally under ``_shape`` (as a string); it is
        exported under the public name ``shape`` so that the result can be
        fed back into ``from_json``.

        Returns
        -------
        str
            JSON string representation of the DataCard.
        """
        # Rename only the top-level `_shape` key. A textual replace on the
        # serialized string would also rewrite any other key or value that
        # happens to contain "_shape" (e.g. a description).
        payload = {('shape' if key == '_shape' else key): value
                   for key, value in self.__dict__.items()}
        return json.dumps(payload, default=lambda o: o.__dict__)

    @property
    def shape(self):
        """Get the shape of the data.

        Returns
        -------
        tuple or None
            The shape parsed back from its stored string form.
        """
        # FIXME This might pose security risk if user
        # arbitrarily sets card shape
        # literal_eval only accepts Python literals; a string that is not a
        # valid literal raises ValueError or SyntaxError here.
        return ast.literal_eval(self._shape)

    @shape.setter
    def shape(self, value) -> None:
        """Set the shape of the data.

        Parameters
        ----------
        value : tuple or None
            Shape of the data to set. It is stored as ``str(value)``.
        """
        self._shape = str(value)
class ModelCard(Card):
    """Class for storing model card attributes.

    Parameters
    ----------
    address : str
        The address of model in the datastore.
    model_type : str
        The type of model. Ex: dc.models.RandomForest. Must be one of
        ``SUPPORTED_MODEL_TYPES``.
    train_dataset_address : str
        Training dataset used to train the model.
    description : str, optional
        A description about the model.
    featurizer : str, optional
        The featurizer used in the dataset.
    intended_use : str, optional
        Notes on the model - its intended use.
    caveats : str, optional
        Notes on the model - the caveats in using it.
    init_kwargs : dict, optional
        Initialization kwargs for the model ex: n_layers. Defaults to a new
        empty dict.
    train_kwargs : dict, optional
        Training kwargs for the model ex: n_epochs. Defaults to a new empty
        dict.
    **kwargs
        Additional attributes to set on the model card.

    Raises
    ------
    TypeError
        If ``address``, ``model_type`` or ``train_dataset_address`` is not a
        string.
    AssertionError
        If ``model_type`` is not supported.
    """

    SUPPORTED_MODEL_TYPES = list(model_mappings.model_address_map.keys())

    def __init__(self,
                 address: str,
                 model_type: str,
                 train_dataset_address: str,
                 description: Optional[str] = None,
                 featurizer: Optional[str] = None,
                 intended_use: Optional[str] = None,
                 caveats: Optional[str] = None,
                 init_kwargs: Optional[Dict] = None,
                 train_kwargs: Optional[Dict] = None,
                 **kwargs) -> None:
        """Initialize a ModelCard."""
        super().__init__()
        if not isinstance(address, str):
            raise TypeError("address must be a string")
        if not isinstance(model_type, str):
            raise TypeError("model_type must be a string")
        if not isinstance(train_dataset_address, str):
            raise TypeError("train_dataset_address must be a string")
        # Raised explicitly (instead of using `assert`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if model_type not in self.SUPPORTED_MODEL_TYPES:
            raise AssertionError(
                'Model type {} is not supported. Supported model types are {}'
                .format(model_type, ' '.join(self.SUPPORTED_MODEL_TYPES)))
        self.address = address
        self.model_type = model_type
        self.train_dataset_address = train_dataset_address
        # Note: We don't have datatype here because we will be only storing
        # models of type dc.model.Models
        self.description = description
        self.featurizer = featurizer
        self.intended_use = intended_use
        self.caveats = caveats
        # A fresh dict per instance: a `{}` default in the signature would be
        # one object shared (and mutated) across every card.
        self.init_kwargs = {} if init_kwargs is None else init_kwargs
        self.train_kwargs = {} if train_kwargs is None else train_kwargs
        self.pretrained_model_address: Optional[str] = None
        # Extra attributes are applied last, so they may override any of the
        # attributes set above (e.g. when rebuilding a card via from_json).
        for key, value in kwargs.items():
            setattr(self, key, value)

    @classmethod
    def from_json(cls, json_data: str) -> "ModelCard":
        """Create ModelCard from JSON string.

        Parameters
        ----------
        json_data : str
            JSON string representation of the ModelCard.

        Returns
        -------
        ModelCard
            ModelCard instance created from the JSON data.
        """
        data = json.loads(json_data)
        return cls(**data)

    @classmethod
    def from_bytes(cls, card_bytes: bytes) -> "ModelCard":
        """Create ModelCard from bytes.

        Parameters
        ----------
        card_bytes : bytes
            UTF-8 encoded JSON representation of the ModelCard.

        Returns
        -------
        ModelCard
            ModelCard instance created from the bytes data.
        """
        # Dispatch through cls so subclasses get an instance of their own type.
        return cls.from_json(card_bytes.decode('utf8'))