tanat.metadata package#

Submodules#

tanat.metadata.feature module#

Feature metadata definitions and helpers.

class tanat.metadata.feature.ArrayInfo(name: str, dtype: str, dimension: int | None)[source]#

Bases: FeatureInfo

Metadata for array features (fixed-size Array or variable-size List).

__init__(name: str, dtype: str, dimension: int | None) None[source]#
dimension: int | None[source]#
classmethod from_stats(col_name: str, dtype: Any, stats: dict) ArrayInfo[source]#

Factory: Builds instance from stats.

classmethod get_aggregations(col_name: str, dtype: DataType | None = None) list[Expr][source]#

Returns polars expressions needed to compute metadata.

property summary: str[source]#

User-facing type summary (without the feature name).

class tanat.metadata.feature.BooleanInfo(name: str, dtype: str, true_count: int | None, false_count: int | None)[source]#

Bases: FeatureInfo

Metadata for boolean features.

__init__(name: str, dtype: str, true_count: int | None, false_count: int | None) None[source]#
false_count: int | None[source]#
classmethod from_stats(col_name: str, dtype: Any, stats: dict) BooleanInfo[source]#

Builds a BooleanInfo from precomputed stats.

classmethod get_aggregations(col_name: str, dtype: DataType | None = None) list[Expr][source]#

Returns polars expressions needed to compute metadata.

property summary: str[source]#

User-facing type summary (without the feature name).

true_count: int | None[source]#
class tanat.metadata.feature.CategoricalInfo(name: str, dtype: str, n_unique: int | None, ordered: bool = False)[source]#

Bases: FeatureInfo

Metadata for categorical/string features.

Note

n_unique is computed from the first 10 000 rows sampled by build_feature_metadata(). On large datasets this may undercount the actual number of distinct categories.

__init__(name: str, dtype: str, n_unique: int | None, ordered: bool = False) None[source]#
classmethod from_stats(col_name: str, dtype: Any, stats: dict) CategoricalInfo[source]#

Builds a CategoricalInfo from precomputed stats.

Only Enum dtype is considered ordered (user-defined order); standard Categorical uses lexical ordering.

classmethod get_aggregations(col_name: str, dtype: DataType | None = None) list[Expr][source]#

Returns n_unique aggregation expression.

n_unique: int | None[source]#
ordered: bool = False[source]#
property summary: str[source]#

User-facing type summary (without the feature name).

class tanat.metadata.feature.FeatureInfo(name: str, dtype: str)[source]#

Bases: ABC

Base class for feature metadata.

__init__(name: str, dtype: str) None[source]#
dtype: str[source]#
abstractmethod classmethod from_stats(col_name: str, dtype: str | DataType, stats: dict) FeatureInfo[source]#

Factory: Builds instance from stats.

abstractmethod classmethod get_aggregations(col_name: str, dtype: DataType | None = None) list[Expr][source]#

Returns polars expressions needed to compute metadata.

name: str[source]#
abstract property summary: str[source]#

User-facing type summary (without the feature name).

to_json_dict() dict[source]#

Converts this instance to a JSON-serializable dictionary.

class tanat.metadata.feature.NumericalInfo(name: str, dtype: str, min: float | int | None, max: float | int | None)[source]#

Bases: FeatureInfo

Metadata for numerical features.

__init__(name: str, dtype: str, min: float | int | None, max: float | int | None) None[source]#
classmethod from_stats(col_name: str, dtype: str | DataType, stats: dict) NumericalInfo[source]#

Builds a NumericalInfo from precomputed stats.

classmethod get_aggregations(col_name: str, dtype: DataType | None = None) list[Expr][source]#

Returns min and max aggregation expressions.

max: float | int | None[source]#
min: float | int | None[source]#
property summary: str[source]#

User-facing type summary (without the feature name).

class tanat.metadata.feature.StringInfo(name: str, dtype: str, min_length: int | None, max_length: int | None)[source]#

Bases: FeatureInfo

Metadata for string features.

__init__(name: str, dtype: str, min_length: int | None, max_length: int | None) None[source]#
classmethod from_stats(col_name: str, dtype: Any, stats: dict) StringInfo[source]#

Factory: Builds instance from stats.

classmethod get_aggregations(col_name: str, dtype: DataType | None = None) list[Expr][source]#

Returns polars expressions needed to compute metadata.

max_length: int | None[source]#
min_length: int | None[source]#
property summary: str[source]#

User-facing type summary (without the feature name).

class tanat.metadata.feature.TemporalInfo(name: str, dtype: str, min: Any | None, max: Any | None)[source]#

Bases: FeatureInfo

Metadata for temporal features (Date, Time, Datetime, Duration).

__init__(name: str, dtype: str, min: Any | None, max: Any | None) None[source]#
classmethod from_stats(col_name: str, dtype: Any, stats: dict) TemporalInfo[source]#

Factory: Builds instance from stats.

classmethod get_aggregations(col_name: str, dtype: DataType | None = None) list[Expr][source]#

Returns polars expressions needed to compute metadata.

property is_duration: bool[source]#

Returns True if this feature has a pl.Duration type.

max: Any | None[source]#
min: Any | None[source]#
property summary: str[source]#

User-facing type summary (without the feature name).

to_json_dict() dict[source]#

Converts this instance to a JSON-serializable dictionary.

tanat.metadata.feature.build_feature_metadata(lf: LazyFrame) list[FeatureInfo][source]#

Compute semantic metadata for every column in lf.

A sample of 10 000 rows is used for statistics so it stays fast on large datasets.

Returns:

List of FeatureInfo instances in alphabetical order.

tanat.metadata.feature.get_feature_info_class(dtype: DataType) type[FeatureInfo][source]#

Returns the appropriate FeatureInfo class for a given Polars DataType.

tanat.metadata.sequence module#

Sequence Metadata definitions.

class tanat.metadata.sequence.SequenceMetadata(seq_id: DataType, time_index: TimeIndexInfo, entity_features: list[FeatureInfo], static_features: list[FeatureInfo] | None)[source]#

Bases: object

Rich Sequence metadata with semantic profiling.

Feature lists (entity_features, static_features) are guaranteed to be in alphabetical order.

__init__(seq_id: DataType, time_index: TimeIndexInfo, entity_features: list[FeatureInfo], static_features: list[FeatureInfo] | None) None[source]#
assert_features_compatible_with(other: SequenceMetadata, alias: str, *, context: str = 'Features must be compatible for merging.') list[str][source]#

Check that other exposes at least all entity features declared in self, with matching dtypes.

Parameters:
  • other – Metadata of the sequence or pool being compared.

  • alias – Label used in error messages to identify other.

  • context – Sentence appended to each error message.

Returns:

List of feature names present in other but absent in self (extras).

Raises:
  • ValueError – If other is missing a feature present in self.

  • TypeError – If a shared feature has an incompatible dtype.

assert_id_compatible_with(other: SequenceMetadata, alias: str, *, context: str = 'ID dtypes must match.') None[source]#

Raises TypeError if other has a different ID column dtype.

Parameters:
  • other – Metadata of the sequence or pool being compared.

  • alias – Label used in the error message to identify other.

  • context – Sentence appended to the error message.

Raises:

TypeError – If the ID dtypes differ.

assert_time_index_compatible_with(other: SequenceMetadata, alias: str, *, context: str = 'Temporal schemas must match.') None[source]#

Raises TypeError if other has an incompatible time index schema.

Two time index schemas are compatible when they share the same Datetime unit and time_zone (or identical numeric dtype for timestep sequences). Range information (min/max) is not checked.

Parameters:
  • other – Metadata of the sequence or pool being compared.

  • alias – Label used in the error message to identify other.

  • context – Sentence appended to the error message.

Raises:

TypeError – If the time index schemas are incompatible.

entity_features: list[FeatureInfo][source]#
feature_info(name: str, is_static: bool = False) FeatureInfo | None[source]#

Return the FeatureInfo for name, or None.

Parameters:
  • name – Feature name to look up.

  • is_static – True for static features, False for entity features.

Returns:

The matching FeatureInfo instance, or None if not found.

classmethod infer_entity_features(entity_lf: LazyFrame | None) list[FeatureInfo][source]#

Returns the list of FeatureInfo for entity_lf, or [] if None.

classmethod infer_static_features(static_lf: LazyFrame | None) list[FeatureInfo] | None[source]#

Returns the list of FeatureInfo for static_lf, or None if None.

classmethod infer_time_index(time_index: LazyFrame) TimeIndexInfo[source]#

Returns a TimeIndexInfo built from the time index LazyFrame.

is_categorical_feature(name: str, is_static: bool = False) bool[source]#

Returns True if name is a Categorical or Enum feature.

Parameters:
  • name – Feature name to check.

  • is_static – True for static features, False for entity features.

Raises:

KeyError – If the feature name is not found.

Examples:

>>> pool.metadata.is_categorical_feature("status")
True
is_datetime_feature(name: str, is_static: bool = False) bool[source]#

Returns True if name is a pl.Datetime or pl.Date feature (not a Duration).

Parameters:
  • name – Feature name to check.

  • is_static – True for static features, False for entity features.

Raises:

KeyError – If the feature name is not found.

Examples:

>>> pool.metadata.is_datetime_feature("discharge_time")
True
is_duration_feature(name: str, is_static: bool = False) bool[source]#

Returns True if name is a pl.Duration feature.

Parameters:
  • name – Feature name to check.

  • is_static – True for static features, False for entity features.

Raises:

KeyError – If the feature name is not found.

Examples:

>>> pool.metadata.is_duration_feature("los")
True
is_numeric_feature(name: str, is_static: bool = False) bool[source]#

Returns True if name is a numeric (integer or float) feature.

Parameters:
  • name – Feature name to check.

  • is_static – True for static features, False for entity features.

Raises:

KeyError – If the feature name is not found.

Examples:

>>> pool.metadata.is_numeric_feature("duration_hrs")
True
scope(entity_features: list[str] | None = None, static_features: list[str] | None = None) SequenceMetadata[source]#

Return a new metadata restricted to the given feature subsets.

Parameters:
  • entity_features – Feature names to keep. None keeps all.

  • static_features – Feature names to keep. None keeps all. An empty list produces static_features=None.

Returns:

A filtered copy, or self when nothing was actually removed. Original feature order is preserved.

seq_id: DataType[source]#
static_features: list[FeatureInfo] | None[source]#
time_index: TimeIndexInfo[source]#
to_json_dict() dict[source]#

Converts metadata to a JSON-serializable dictionary.

class tanat.metadata.sequence.TimeIndexInfo(dtype: str, is_datetime: bool, min: Any | None, max: Any | None, unit: str | None = None, time_zone: str | None = None)[source]#

Bases: object

Metadata for the sequence time index (time columns).

__init__(dtype: str, is_datetime: bool, min: Any | None, max: Any | None, unit: str | None = None, time_zone: str | None = None) None[source]#
dtype: str[source]#
classmethod from_lazyframe(lf: LazyFrame) TimeIndexInfo[source]#

Factory: builds a TimeIndexInfo by inspecting all columns of the time index LazyFrame.

Validates that:

  • Every column is a supported type (pl.Datetime, pl.Date, or a numeric type, integer or float, for discrete timesteps).

  • All columns share the same base type (no mix of Datetime start with Date end, for example).

Also computes the global min/max across all time index columns.

Raises:

TypeError – If a column has an unsupported type or if column types are inconsistent.

is_datetime: bool[source]#
is_schema_compatible(other: TimeIndexInfo) bool[source]#

Returns True if other has the same schema as this instance.

Only the structural fields are compared (dtype, is_datetime, unit, time_zone). Range fields (min, max) are intentionally ignored. Different stores can cover different time periods and still be joinable.

max: Any | None[source]#
min: Any | None[source]#
time_zone: str | None = None[source]#
to_json_dict() dict[source]#

Converts this instance to a JSON-serializable dictionary.

unit: str | None = None[source]#

tanat.metadata.trajectory module#

Trajectory Metadata definitions.

class tanat.metadata.trajectory.TrajectoryMetadata(traj_id: DataType, time_index: TimeIndexInfo, static_features: list[FeatureInfo] | None)[source]#

Bases: object

Rich Trajectory metadata with semantic profiling.

traj_id[source]#

Polars DataType of the trajectory ID column.

Type:

polars.datatypes.classes.DataType

time_index[source]#

Aggregated time index info across all linked stores. A TrajectoryStore always has at least one linked store, so this field is never None.

Type:

tanat.metadata.sequence.TimeIndexInfo

static_features[source]#

List of FeatureInfo for each static feature (alphabetical order), or None if none exist.

Type:

list[tanat.metadata.feature.FeatureInfo] | None

__init__(traj_id: DataType, time_index: TimeIndexInfo, static_features: list[FeatureInfo] | None) None[source]#
classmethod infer_static(lf: LazyFrame | None) list[FeatureInfo] | None[source]#

Returns the list of FeatureInfo for lf, or None if lf is None.

classmethod infer_time_index(seq_stores: dict[str, SequenceStore]) TimeIndexInfo[source]#

Aggregates the time index range across all linked stores by taking the global min/max.

Raises:

ValueError – If seq_stores is empty (should never happen for a valid TrajectoryStore).

is_categorical_feature(name: str) bool[source]#

Returns True if name is a Categorical or Enum static feature.

Parameters:

name – Feature name to check.

Raises:

KeyError – If the feature name is not found.

Examples:

>>> traj.metadata.is_categorical_feature("group")
True
scope(static_features: list[str] | None = None) TrajectoryMetadata[source]#

Return a new metadata restricted to the given static feature subset.

Parameters:

static_features – Feature names to keep. None keeps all. An empty list produces static_features=None.

Returns:

A filtered copy, or self when nothing was actually removed. Original feature order is preserved.

static_features: list[FeatureInfo] | None[source]#
time_index: TimeIndexInfo[source]#
to_json_dict() dict[source]#

Converts metadata to a JSON-serializable dictionary.

traj_id: DataType[source]#

Module contents#

Package stub.