import ast
import datetime
import json
from typing import Dict, Optional
from deepchem_server.core import model_mappings
class Card:
    """Base class for cards.

    Provides the functionality shared by data and model cards:
    JSON/bytes serialization, attribute updates, and a timestamp
    recording when the card was created/last touched.
    """

    def __init__(self) -> None:
        """Initialize a Card, stamping it with the current local time."""
        # Human-readable timestamp, e.g. "01-January-2024 12:00:00".
        self.last_updated_time = datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S')

    def __bytes__(self) -> bytes:
        """Return the card's JSON representation encoded as UTF-8 bytes.

        Returns
        -------
        bytes
            UTF-8 encoding of ``self.to_json()``.
        """
        return self.to_json().encode('utf8')

    def to_json(self) -> str:
        """Serialize the card to a JSON string.

        Returns
        -------
        str
            JSON object built from the card's instance attributes
            (``__dict__``); nested objects are serialized the same way.
        """
        return json.dumps(self, default=lambda obj: obj.__dict__)

    def update_card(self, key: str, value) -> None:
        """Set attribute ``key`` on the card to ``value``.

        Parameters
        ----------
        key : str
            The attribute name to update.
        value : Any
            The new value for the attribute.

        Returns
        -------
        None
        """
        setattr(self, key, value)
class DataCard(Card):
    """Class for storing data card attributes.

    Parameters
    ----------
    address : str
        Address of the reference object in the datastore.
    file_type : str
        The file extension - ex. csv filetype, .json file type etc.
    data_type : str
        The type of object stored at the location pointed by filename -
        ex: pd.DataFrame, dask.dataframe.DataFrame.
    shape : tuple, optional
        Shape of the data object.
    description : str, optional
        A description about the datastore.
    featurizer : str, optional
        The featurizer used in the dataset.
    intended_use : str, optional
        Notes on dataset - the intended use of the dataset.
    caveats : str, optional
        Notes on dataset - the caveats in using the dataset.
    feat_kwargs : dict, optional
        Keyword arguments for featurizer (used when featurizer is not None).
    **kwargs
        Additional attributes to set on the card.

    Notes
    -----
    Difference between data_type and file_type:
    An example can illustrate this better. A csv file (file_type) can either be
    a pandas.DataFrame or dask.dataframe.DataFrame or just a csv file. The file_type
    holds the file extension ('csv') while data_type refers to the data
    object (pandas.DataFrame, dask.dataframe.DataFrame, etc).
    """

    SUPPORTED_DATA_TYPES = [
        'pandas.DataFrame',
        'dc.data.NumpyDataset',
        'dc.data.DiskDataset',
        'json',
        'text/plain',
        'png',
        'binary',
    ]

    SUPPORTED_FILE_TYPES = [
        'csv', 'dir', 'json', 'pdb', 'fasta', 'fastq', 'png', 'sdf', 'dcd', 'txt', 'xml', 'py', 'pdbqt', 'zip', 'smi',
        'smiles', 'bz2', 'cxsmiles', 'onnx', 'hdf5', 'log'
    ]

    def __init__(self,
                 address: str,
                 file_type: str,
                 data_type: str,
                 shape=None,
                 description: Optional[str] = None,
                 featurizer: Optional[str] = None,
                 intended_use: Optional[str] = None,
                 caveats: Optional[str] = None,
                 feat_kwargs: Optional[Dict] = None,
                 **kwargs) -> None:
        """Initialize a DataCard."""
        super().__init__()
        # Type-check raw inputs before any normalization so that a
        # non-string data_type raises TypeError rather than a confusing
        # assertion inside validate_datatype.
        if not isinstance(address, str):
            raise TypeError("address must be a string")
        if not isinstance(file_type, str):
            raise TypeError("file_type must be a string")
        if not isinstance(data_type, str):
            raise TypeError("data_type must be a string")
        data_type = self.validate_datatype(data_type)
        assert file_type in self.SUPPORTED_FILE_TYPES, 'Filetype {} is not supported. Supported file types are {}'.format(
            file_type, self.SUPPORTED_FILE_TYPES)
        self.shape = shape
        self.address = address
        self.file_type = file_type
        self.data_type = data_type
        self.description = description
        self.featurizer = featurizer
        self.intended_use = intended_use
        self.caveats = caveats
        self.feat_kwargs = feat_kwargs
        # FIXME we should not depend on kwargs as internal parameters. These are features
        # for users to store additional details.
        for key, value in kwargs.items():
            setattr(self, key, value)

    def validate_datatype(self, data_type: str) -> str:
        """Validate and normalize data type name.

        Parameters
        ----------
        data_type : str
            The data type to validate.

        Returns
        -------
        str
            The validated and normalized data type.

        Raises
        ------
        AssertionError
            If the data type is not supported.
        """
        # expand short data type names to their canonical form
        if data_type == 'DataFrame':
            data_type = 'pandas.DataFrame'
        elif data_type == 'DiskDataset':
            data_type = 'dc.data.DiskDataset'
        elif data_type == 'NumpyDataset':
            data_type = 'dc.data.NumpyDataset'
        assert data_type in self.SUPPORTED_DATA_TYPES, 'Datatype {} is not supported. Supported data types are {}'.format(
            data_type, self.SUPPORTED_DATA_TYPES)
        return data_type

    @classmethod
    def from_json(cls, json_data: str) -> "DataCard":
        """Create DataCard from JSON string.

        Parameters
        ----------
        json_data : str
            JSON string representation of the DataCard.

        Returns
        -------
        DataCard
            DataCard instance created from the JSON data.
        """
        args = json.loads(json_data)
        return cls(**args)
        # Note: The above method may fail if `DataCard` contains nested objects.
        # return json.loads(json_data, object_hook=lambda d: DataCard(**d))

    @classmethod
    def from_bytes(cls, card_bytes: bytes) -> "DataCard":
        """Create DataCard from bytes.

        Parameters
        ----------
        card_bytes : bytes
            Bytes representation of the DataCard.

        Returns
        -------
        DataCard
            DataCard instance created from the bytes data.
        """
        # Dispatch through cls so that subclasses deserialize to their own type.
        return cls.from_json(card_bytes.decode('utf8'))

    def get_n_samples(self) -> int:
        """Get the number of samples in the dataset.

        Returns
        -------
        int
            Number of samples in the dataset.

        Raises
        ------
        ValueError
            If the dataset does not have shape information.
        """
        if self.shape is None:
            raise ValueError("the dataset does not have shape")
        if self.data_type == 'pandas.DataFrame':
            return self.shape[0]  # shape of dataframe is (n_rows x n_cols)
        elif self.data_type in ['dc.data.DiskDataset', 'dc.data.NumpyDataset']:
            # A deepchem dataset has shape (X_shape, y_shape, w_shape, ids_shape)
            x_shape = self.shape[0]
            return x_shape[0]
        else:
            return self.shape

    def to_json(self) -> str:
        """Convert DataCard to JSON string.

        Returns
        -------
        str
            JSON string representation of the DataCard. The private
            ``_shape`` key is renamed back to ``shape`` so the output
            round-trips through ``from_json``.
        """
        json_str = json.dumps(self, default=lambda o: o.__dict__)
        return json_str.replace('_shape', 'shape')

    @property
    def shape(self):
        """Get the shape of the data.

        Returns
        -------
        tuple
            Shape of the data as a tuple (or None if unset).
        """
        # FIXME This might pose security risk if user
        # arbitrarily sets card shape
        return ast.literal_eval(self._shape)

    @shape.setter
    def shape(self, value) -> None:
        """Set the shape of the data.

        Parameters
        ----------
        value : tuple or None
            Shape of the data to set. Stored as its string repr so the
            card stays trivially JSON-serializable.
        """
        self._shape = str(value)
class ModelCard(Card):
    """Class for storing model card attributes.

    Parameters
    ----------
    address : str
        The address of model in the datastore.
    model_type : str
        The type of model. Ex: dc.models.RandomForest.
    train_dataset_address : str
        Training dataset used to train the model.
    description : str, optional
        A description about the model.
    featurizer : str, optional
        The featurizer used in the dataset.
    intended_use : str, optional
        Notes on dataset - the intended use of the dataset.
    caveats : str, optional
        Notes on dataset - the caveats in using the dataset.
    init_kwargs : dict, optional
        Initialization kwargs for the model ex: n_layers.
    train_kwargs : dict, optional
        Training kwargs for the model ex: n_epochs.
    **kwargs
        Additional attributes to set on the model card.
    """

    SUPPORTED_MODEL_TYPES = list(model_mappings.model_address_map.keys())

    def __init__(self,
                 address: str,
                 model_type: str,
                 train_dataset_address: str,
                 description: Optional[str] = None,
                 featurizer: Optional[str] = None,
                 intended_use: Optional[str] = None,
                 caveats: Optional[str] = None,
                 init_kwargs: Optional[Dict] = None,
                 train_kwargs: Optional[Dict] = None,
                 **kwargs) -> None:
        """Initialize a ModelCard."""
        super().__init__()
        if not isinstance(address, str):
            raise TypeError("address must be a string")
        if not isinstance(model_type, str):
            raise TypeError("model_type must be a string")
        if not isinstance(train_dataset_address, str):
            raise TypeError("train_dataset_address must be a string")
        assert model_type in self.SUPPORTED_MODEL_TYPES, 'Model type {} is not supported. Supported model types are {}'.format(
            model_type, ' '.join(self.SUPPORTED_MODEL_TYPES))
        self.address = address
        self.model_type = model_type
        self.train_dataset_address = train_dataset_address
        # Note: We don't have datatype here because we will be only storing
        # models of type dc.model.Models
        self.description = description
        self.featurizer = featurizer
        self.intended_use = intended_use
        self.caveats = caveats
        # Avoid mutable default arguments: a shared `{}` default would be
        # aliased across every ModelCard instance created without kwargs.
        self.init_kwargs = init_kwargs if init_kwargs is not None else {}
        self.train_kwargs = train_kwargs if train_kwargs is not None else {}
        self.pretrained_model_address: Optional[str] = None
        for key, value in kwargs.items():
            setattr(self, key, value)

    @classmethod
    def from_json(cls, json_data: str) -> "ModelCard":
        """Create ModelCard from JSON string.

        Parameters
        ----------
        json_data : str
            JSON string representation of the ModelCard.

        Returns
        -------
        ModelCard
            ModelCard instance created from the JSON data.
        """
        data = json.loads(json_data)
        return cls(**data)

    @classmethod
    def from_bytes(cls, card_bytes: bytes) -> "ModelCard":
        """Create ModelCard from bytes.

        Parameters
        ----------
        card_bytes : bytes
            Bytes representation of the ModelCard.

        Returns
        -------
        ModelCard
            ModelCard instance created from the bytes data.
        """
        # Dispatch through cls so that subclasses deserialize to their own type.
        return cls.from_json(card_bytes.decode('utf8'))