tanat.criterion.type package#

Submodules#

tanat.criterion.type.entity module#

EntityCriterion: filter entities/sequences by a Polars expression.

Compatibility: ENTITY, SEQUENCE.

class tanat.criterion.type.entity.EntityCriterion(query: Expr)[source]#

Bases: Criterion

Filter entities or select sequences using a Polars expression.

Supported levels: ENTITY, SEQUENCE.

Example:

# entity-level pruning (keep only rows where diag_type == "DP")
pool2 = pool.filter_entities(EntityCriterion(query=pl.col("diag_type") == "DP"))

# sequence selection: IDs that have at least one such row
ids = pool.which(EntityCriterion(query=pl.col("diag_type") == "DP"))

# single sequence match
ok = seq.match(EntityCriterion(query=pl.col("diag_type") == "DP"))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.ENTITY, CriterionLevel.SEQUENCE})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of EntityCriterionSettings

__init__(query: Expr) → None[source]#

class tanat.criterion.type.entity.EntityCriterionSettings(*, query: pl.Expr)[source]#

Bases: object

Settings for EntityCriterion.

Parameters: query – A polars.Expr used for lazy filtering.

__init__(*args: Any, **kwargs: Any) → None[source]#

model_dump(*, mode='python', **dump_kwargs)[source]#: Dump settings to a dict via Pydantic serialization.

query: Expr[source]#

tanat.criterion.type.length module#

LengthCriterion: filter sequences by number of entities.

Compatibility: SEQUENCE only.

class tanat.criterion.type.length.LengthCriterion(*, gt: int | None = None, ge: int | None = None, lt: int | None = None, le: int | None = None)[source]#

Bases: Criterion

Select sequences by their number of entities (rows).

Supported levels: SEQUENCE.

Example:

# sequences with more than 5 entities
ids = pool.which(LengthCriterion(gt=5))
pool2 = pool.subset(ids)

# a single sequence
ok = seq.match(LengthCriterion(ge=3, lt=20))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.SEQUENCE})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of LengthCriterionSettings

__init__(*, gt: int | None = None, ge: int | None = None, lt: int | None = None, le: int | None = None) → None[source]#

class tanat.criterion.type.length.LengthCriterionSettings(*, gt: int | None = None, ge: int | None = None, lt: int | None = None, le: int | None = None)[source]#

Bases: object

Validation settings for LengthCriterion.

At least one bound must be provided. Contradictory bounds are rejected at construction time.

Parameters:

gt – Strictly greater than.
ge – Greater than or equal to.
lt – Strictly less than.
le – Less than or equal to.

__init__(*args: Any, **kwargs: Any) → None[source]#

ge: int | None = None[source]#

gt: int | None = None[source]#

le: int | None = None[source]#

lt: int | None = None[source]#

model_dump(*, mode='python', **dump_kwargs)[source]#: Dump settings to a dict via Pydantic serialization.

tanat.criterion.type.pattern module#

PatternCriterion: filter sequences by an ordered pattern of string values.

Compatibility: ENTITY, SEQUENCE.

tanat.criterion.type.pattern.ANY = '...'[source]#: Matches zero or more elements — free gap between adjacent segments.

class tanat.criterion.type.pattern.PatternCriterion(feature: str, pattern: str | list[str], present: bool = True, regex: bool = True, case_sensitive: bool = True)[source]#

Bases: Criterion

Filter entities or sequences by an ordered pattern of string values.

A sequence matches when its entities (in temporal order) contain the given pattern in order: each pattern element must directly follow the previous one, unless the two are separated by ANY, which allows a gap of any length between them.

Supported levels: ENTITY, SEQUENCE.

Entity level (filter_entities):

present=True: keeps only the rows that are “witnesses” of the greedy first match. Sequences without a complete match → 0 rows.
present=False: keeps all rows that are not witnesses (rows that don’t participate in the pattern). Sequences without a complete match → all their rows are kept.

Sequence level (which, match):

Keeps (or excludes, with present=False) whole sequences based on whether the ordered pattern is found.

Example:

# IDs where "A" appears directly before "B" (adjacent)
ids = pool.which(PatternCriterion(feature="code", pattern=["A", "B"]))

# Entity pruning: keep only the matched witness rows
pool2 = pool.filter_entities(
    PatternCriterion(feature="code", pattern=["A", "B"])
)

# Free gap: A before B with any rows in between
ids = pool.which(
    PatternCriterion(feature="code", pattern=["A", ANY, "B"])
)

# Exactly one element between A and B
ids = pool.which(
    PatternCriterion(feature="code", pattern=["A", WILDCARD, "B"])
)

# Single-element pattern: at least one row matching "ICU"
ids = pool.which(PatternCriterion(feature="label", pattern="ICU"))

# Exclusion: sequences that never contain adjacent A→B
ids = pool.which(
    PatternCriterion(feature="code", pattern=["A", "B"], present=False)
)

# Literal, case-insensitive
ids = pool.which(
    PatternCriterion(feature="code", pattern="icu", regex=False, case_sensitive=False)
)

# Single-sequence match
ok = seq.match(PatternCriterion(feature="code", pattern=["A", "B"]))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.ENTITY, CriterionLevel.SEQUENCE})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of PatternCriterionSettings

__init__(feature: str, pattern: str | list[str], present: bool = True, regex: bool = True, case_sensitive: bool = True) → None[source]#

class tanat.criterion.type.pattern.PatternCriterionSettings(*, feature: str, pattern: str | list[str], present: bool = True, regex: bool = True, case_sensitive: bool = True)[source]#

Bases: object

Settings for PatternCriterion.

Parameters:

feature – Name of the string feature to match against.
pattern –
Ordered pattern to match against the feature column.

Elements are matched adjacently (consecutively) within each segment. Use WILDCARD ("*") to match exactly one element of any non-null value, and ANY ("...") to introduce a free gap of any length between segments:
- ["A", "B"] — A directly followed by B.
- ["A", WILDCARD, "B"] — A, then exactly one element, then B.
- ["A", ANY, "B"] — A before B with any number of rows in between.
- ["A", ANY, "B", "C"] — A anywhere, then B directly followed by C.
A plain str is shorthand for a single-element list.
present –
When True (default), the pattern must be present:
- Sequence level: sequences that contain the pattern are selected.
- Entity level: the “witness” rows of the greedy first match are kept.
When False, the pattern must be absent:
- Sequence level: sequences that do not contain the pattern are selected.
- Entity level: all rows that are not witnesses are kept (rows that don’t participate in the pattern match).
regex – If True (default), each non-sentinel element of pattern is treated as a regular expression. Set to False for literal substring matching. WILDCARD and ANY always behave as sentinels regardless of this flag.
case_sensitive – Case-sensitive matching (default: True).

__init__(*args: Any, **kwargs: Any) → None[source]#

case_sensitive: bool = True[source]#

feature: str[source]#

model_dump(*, mode='python', **dump_kwargs)[source]#: Dump settings to a dict via Pydantic serialization.

pattern: str | list[str][source]#

present: bool = True[source]#

regex: bool = True[source]#

tanat.criterion.type.pattern.WILDCARD = '*'[source]#: Matches exactly one non-null value at that position (any value).

tanat.criterion.type.rank module#

RankCriterion: select entities by their positional rank within a sequence.

Compatibility: ENTITY only.

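class tanat.criterion.type.rank.RankCriterion(*, first: int | None = None, last: int | None = None, start: int | None = None, end: int | None = None, step: int | None = None, ranks: list[int] | int | None = None, relative: bool = False)[source]#

Bases: Criterion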

Select entities by their positional rank within a sequence.

Supported levels: ENTITY only.

Entities are numbered 0-based within each sequence in their natural store order. Negative indices use Python-style semantics (from the end).

Example:

# keep the first 3 entities
pool2 = pool.filter_entities(RankCriterion(first=3))

# keep all except the last 2 entities
pool2 = pool.filter_entities(RankCriterion(first=-2))

# keep the last 2 entities
pool2 = pool.filter_entities(RankCriterion(last=2))

# keep all except the first 3 entities
pool2 = pool.filter_entities(RankCriterion(last=-3))

# keep entities at ranks 2, 3, 4 (Python slice semantics)
pool2 = pool.filter_entities(RankCriterion(start=2, end=5))

# keep from rank 5 to 2nd-from-end
pool2 = pool.filter_entities(RankCriterion(start=5, end=-2))

# keep every other entity
pool2 = pool.filter_entities(RankCriterion(step=2))

# keep specific ranks (first and last)
pool2 = pool.filter_entities(RankCriterion(ranks=[0, -1]))

# keep the 3 entities centered on T0 (requires set_t0())
pool2 = pool.filter_entities(RankCriterion(start=-1, end=2, relative=True))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.ENTITY})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of RankCriterionSettings

__init__(*, first: int | None = None, last: int | None = None, start: int | None = None, end: int | None = None, step: int | None = None, ranks: list[int] | int | None = None, relative: bool = False) → None[source]#

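class tanat.criterion.type.rank.RankCriterionSettings(*, first: int | None = None, last: int | None = None, start: int | None = None, end: int | None = None, step: int | None = None, ranks: list[int] | None = None, relative: bool = False)[source]#

Bases: object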

Settings for RankCriterion.

Select entities by their 0-based position within each sequence. Exactly one parameter group must be specified: first or last or start/end/step or ranks.

Parameters:

first –
Keep the first N entities.
- N > 0: keep first N (like [:N]).
- N < 0: keep all except last |N| (like [:-|N|]).
- Cannot be 0.
last –
Keep the last N entities.
- N > 0: keep last N (like [-N:]).
- N < 0: keep all except first |N| (like [|N|:]).
- Cannot be 0.
start – Start rank (inclusive, 0-based). Python-style negative indexing supported (e.g. -5 = 5th from the end).
end – End rank (exclusive, 0-based). Python-style negative indexing supported (e.g. -2 = 2nd from the end).
step – Sub-sample every N-th entity (must be >= 1). Only compatible with start/end.
ranks – Specific ranks to keep (0-based, negative = from end). A single int is accepted and coerced to a one-element list.
relative – If True, interpret ranks relative to T0 (nearest entity to time 0).

__init__(*args: Any, **kwargs: Any) → None[source]#

end: int | None = None[source]#

first: int | None = None[source]#

last: int | None = None[source]#

model_dump(*, mode='python', **dump_kwargs)[source]#: Dump settings to a dict via Pydantic serialization.

ranks: list[int] | None = None[source]#

relative: bool = False[source]#

start: int | None = None[source]#

step: int | None = None[source]#

tanat.criterion.type.static module#

StaticCriterion: filter sequences/trajectories by a static-data expression.

Compatibility: SEQUENCE, TRAJECTORY.

class tanat.criterion.type.static.StaticCriterion(query: Expr)[source]#

Bases: Criterion

Select sequences or trajectories using a static-feature expression.

Supported levels: SEQUENCE, TRAJECTORY.

Example:

# sequence pool: keep IDs where age > 50
ids = seq_pool.which(StaticCriterion(query=pl.col("age") > 50))
seq_pool2 = seq_pool.subset(ids)

# trajectory pool
ids = traj_pool.which(StaticCriterion(query=pl.col("group") == "A"))
traj_pool2 = traj_pool.subset(ids)

# single match
ok = seq.match(StaticCriterion(query=pl.col("age") > 50))
ok = traj.match(StaticCriterion(query=pl.col("group") == "A"))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.SEQUENCE, CriterionLevel.TRAJECTORY})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of StaticCriterionSettings

__init__(query: Expr) → None[source]#

class tanat.criterion.type.static.StaticCriterionSettings(*, query: pl.Expr)[source]#

Bases: object

Settings for StaticCriterion.

Parameters: query – A polars.Expr evaluated against the static data frame.

__init__(*args: Any, **kwargs: Any) → None[source]#

model_dump(*, mode='python', **dump_kwargs)[source]#: Dump settings to a dict via Pydantic serialization.

query: Expr[source]#

tanat.criterion.type.time module#

TimeCriterion: filter entities/sequences by temporal bounds.

Compatibility: ENTITY, SEQUENCE.

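class tanat.criterion.type.time.TimeCriterion(*, start_ge: datetime | date | int | float | None = None, start_le: datetime | date | int | float | None = None, end_ge: datetime | date | int | float | None = None, end_le: datetime | date | int | float | None = None, duration_within: bool = False, all_entities: bool = False)[source]#

Bases: Criterion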

Filter entities or select sequences by temporal position.

Supported levels: ENTITY, SEQUENCE.

Example:

import datetime as dt
from tanat.criterion import TimeCriterion

t0 = dt.datetime(2020, 1, 1)
t1 = dt.datetime(2021, 1, 1)

# entity pruning: keep rows whose start time is in [t0, t1]
pool2 = pool.filter_entities(TimeCriterion(start_ge=t0, start_le=t1))

# entity pruning: interval must be fully contained in [t0, t1]
pool3 = pool.filter_entities(
    TimeCriterion(start_ge=t0, end_le=t1, duration_within=True)
)

# sequence selection: IDs with at least one row in the window (default)
ids = pool.which(TimeCriterion(start_ge=t0))

# sequence selection: IDs where ALL rows are in the window
ids = pool.which(TimeCriterion(start_ge=t0, end_le=t1, all_entities=True))

# match
ok = seq.match(TimeCriterion(start_le=t1))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.ENTITY, CriterionLevel.SEQUENCE})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of TimeCriterionSettings

__init__(*, start_ge: datetime | date | int | float | None = None, start_le: datetime | date | int | float | None = None, end_ge: datetime | date | int | float | None = None, end_le: datetime | date | int | float | None = None, duration_within: bool = False, all_entities: bool = False) → None[source]#

class tanat.criterion.type.time.TimeCriterionSettings(*, start_ge: TimeBound | None = None, start_le: TimeBound | None = None, end_ge: TimeBound | None = None, end_le: TimeBound | None = None, duration_within: bool = False, all_entities: bool = False)[source]#

Bases: object

Settings for TimeCriterion.

Temporal bounds are inclusive on both sides. At least one of start_ge / start_le / end_ge / end_le must be provided.

Parameters:

start_ge – Minimum value for the start time column (inclusive).
start_le – Maximum value for the start time column (inclusive).
end_ge – Minimum value for the end time column (inclusive). Only applicable to interval/state sequences (two time columns).
end_le – Maximum value for the end time column (inclusive). Only applicable to interval/state sequences (two time columns).
duration_within – Entity-level (two time columns only). When True, the entity interval must be fully contained within the bounds (start >= start_ge AND end <= end_le). When False (default), any overlap is sufficient. Ignored for single-column (event) sequences.
all_entities – Sequence-level. When True, every entity row of the sequence must satisfy the time filter (sequence is fully within the window). When False (default), at least one entity row must match.

__init__(*args: Any, **kwargs: Any) → None[source]#

all_entities: bool = False[source]#

duration_within: bool = False[source]#

end_ge: datetime | date | int | float | None = None[source]#

end_le: datetime | date | int | float | None = None[source]#

model_dump(*, mode='python', **dump_kwargs)[source]#: Dump settings to a dict via Pydantic serialization.

start_ge: datetime | date | int | float | None = None[source]#

start_le: datetime | date | int | float | None = None[source]#

validate_against(time_index: TimeIndexInfo) → None[source]#

Check that the bounds are compatible with time_index.

Raises: TypeError – If the Python type of a bound is inconsistent with the sequence time index (e.g. a numeric bound for a datetime index, or vice versa).

Module contents#

Criterion type sub-package.

class tanat.criterion.type.EntityCriterion(query: Expr)[source]#

Bases: Criterion

Filter entities or select sequences using a Polars expression.

Supported levels: ENTITY, SEQUENCE.

Example:

# entity-level pruning (keep only rows where diag_type == "DP")
pool2 = pool.filter_entities(EntityCriterion(query=pl.col("diag_type") == "DP"))

# sequence selection: IDs that have at least one such row
ids = pool.which(EntityCriterion(query=pl.col("diag_type") == "DP"))

# single sequence match
ok = seq.match(EntityCriterion(query=pl.col("diag_type") == "DP"))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.ENTITY, CriterionLevel.SEQUENCE})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of EntityCriterionSettings

__init__(query: Expr) → None[source]#

class tanat.criterion.type.LengthCriterion(*, gt: int | None = None, ge: int | None = None, lt: int | None = None, le: int | None = None)[source]#

Bases: Criterion

Select sequences by their number of entities (rows).

Supported levels: SEQUENCE.

Example:

# sequences with more than 5 entities
ids = pool.which(LengthCriterion(gt=5))
pool2 = pool.subset(ids)

# a single sequence
ok = seq.match(LengthCriterion(ge=3, lt=20))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.SEQUENCE})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of LengthCriterionSettings

__init__(*, gt: int | None = None, ge: int | None = None, lt: int | None = None, le: int | None = None) → None[source]#

class tanat.criterion.type.PatternCriterion(feature: str, pattern: str | list[str], present: bool = True, regex: bool = True, case_sensitive: bool = True)[source]#

Bases: Criterion

Filter entities or sequences by an ordered pattern of string values.

A sequence matches when its entities (in temporal order) contain the given pattern in order: each pattern element must directly follow the previous one, unless the two are separated by ANY, which allows a gap of any length between them.

Supported levels: ENTITY, SEQUENCE.

Entity level (filter_entities):

present=True: keeps only the rows that are “witnesses” of the greedy first match. Sequences without a complete match → 0 rows.
present=False: keeps all rows that are not witnesses (rows that don’t participate in the pattern). Sequences without a complete match → all their rows are kept.

Sequence level (which, match):

Keeps (or excludes, with present=False) whole sequences based on whether the ordered pattern is found.

Example:

# IDs where "A" appears directly before "B" (adjacent)
ids = pool.which(PatternCriterion(feature="code", pattern=["A", "B"]))

# Entity pruning: keep only the matched witness rows
pool2 = pool.filter_entities(
    PatternCriterion(feature="code", pattern=["A", "B"])
)

# Free gap: A before B with any rows in between
ids = pool.which(
    PatternCriterion(feature="code", pattern=["A", ANY, "B"])
)

# Exactly one element between A and B
ids = pool.which(
    PatternCriterion(feature="code", pattern=["A", WILDCARD, "B"])
)

# Single-element pattern: at least one row matching "ICU"
ids = pool.which(PatternCriterion(feature="label", pattern="ICU"))

# Exclusion: sequences that never contain adjacent A→B
ids = pool.which(
    PatternCriterion(feature="code", pattern=["A", "B"], present=False)
)

# Literal, case-insensitive
ids = pool.which(
    PatternCriterion(feature="code", pattern="icu", regex=False, case_sensitive=False)
)

# Single-sequence match
ok = seq.match(PatternCriterion(feature="code", pattern=["A", "B"]))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.ENTITY, CriterionLevel.SEQUENCE})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of PatternCriterionSettings

__init__(feature: str, pattern: str | list[str], present: bool = True, regex: bool = True, case_sensitive: bool = True) → None[source]#

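class tanat.criterion.type.RankCriterion(*, first: int | None = None, last: int | None = None, start: int | None = None, end: int | None = None, step: int | None = None, ranks: list[int] | int | None = None, relative: bool = False)[source]#

Bases: Criterion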

Select entities by their positional rank within a sequence.

Supported levels: ENTITY only.

Entities are numbered 0-based within each sequence in their natural store order. Negative indices use Python-style semantics (from the end).

Example:

# keep the first 3 entities
pool2 = pool.filter_entities(RankCriterion(first=3))

# keep all except the last 2 entities
pool2 = pool.filter_entities(RankCriterion(first=-2))

# keep the last 2 entities
pool2 = pool.filter_entities(RankCriterion(last=2))

# keep all except the first 3 entities
pool2 = pool.filter_entities(RankCriterion(last=-3))

# keep entities at ranks 2, 3, 4 (Python slice semantics)
pool2 = pool.filter_entities(RankCriterion(start=2, end=5))

# keep from rank 5 to 2nd-from-end
pool2 = pool.filter_entities(RankCriterion(start=5, end=-2))

# keep every other entity
pool2 = pool.filter_entities(RankCriterion(step=2))

# keep specific ranks (first and last)
pool2 = pool.filter_entities(RankCriterion(ranks=[0, -1]))

# keep the 3 entities centered on T0 (requires set_t0())
pool2 = pool.filter_entities(RankCriterion(start=-1, end=2, relative=True))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.ENTITY})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of RankCriterionSettings

__init__(*, first: int | None = None, last: int | None = None, start: int | None = None, end: int | None = None, step: int | None = None, ranks: list[int] | int | None = None, relative: bool = False) → None[source]#

class tanat.criterion.type.StaticCriterion(query: Expr)[source]#

Bases: Criterion

Select sequences or trajectories using a static-feature expression.

Supported levels: SEQUENCE, TRAJECTORY.

Example:

# sequence pool: keep IDs where age > 50
ids = seq_pool.which(StaticCriterion(query=pl.col("age") > 50))
seq_pool2 = seq_pool.subset(ids)

# trajectory pool
ids = traj_pool.which(StaticCriterion(query=pl.col("group") == "A"))
traj_pool2 = traj_pool.subset(ids)

# single match
ok = seq.match(StaticCriterion(query=pl.col("age") > 50))
ok = traj.match(StaticCriterion(query=pl.col("group") == "A"))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.SEQUENCE, CriterionLevel.TRAJECTORY})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of StaticCriterionSettings

__init__(query: Expr) → None[source]#

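class tanat.criterion.type.TimeCriterion(*, start_ge: datetime | date | int | float | None = None, start_le: datetime | date | int | float | None = None, end_ge: datetime | date | int | float | None = None, end_le: datetime | date | int | float | None = None, duration_within: bool = False, all_entities: bool = False)[source]#

Bases: Criterion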

Filter entities or select sequences by temporal position.

Supported levels: ENTITY, SEQUENCE.

Example:

import datetime as dt
from tanat.criterion import TimeCriterion

t0 = dt.datetime(2020, 1, 1)
t1 = dt.datetime(2021, 1, 1)

# entity pruning: keep rows whose start time is in [t0, t1]
pool2 = pool.filter_entities(TimeCriterion(start_ge=t0, start_le=t1))

# entity pruning: interval must be fully contained in [t0, t1]
pool3 = pool.filter_entities(
    TimeCriterion(start_ge=t0, end_le=t1, duration_within=True)
)

# sequence selection: IDs with at least one row in the window (default)
ids = pool.which(TimeCriterion(start_ge=t0))

# sequence selection: IDs where ALL rows are in the window
ids = pool.which(TimeCriterion(start_ge=t0, end_le=t1, all_entities=True))

# match
ok = seq.match(TimeCriterion(start_le=t1))

LEVELS: ClassVar[frozenset[CriterionLevel]] = frozenset({CriterionLevel.ENTITY, CriterionLevel.SEQUENCE})[source]#: Declare which levels this criterion supports.

SETTINGS_CLASS[source]#: alias of TimeCriterionSettings

__init__(*, start_ge: datetime | date | int | float | None = None, start_le: datetime | date | int | float | None = None, end_ge: datetime | date | int | float | None = None, end_le: datetime | date | int | float | None = None, duration_within: bool = False, all_entities: bool = False) → None[source]#