Note
Go to the end to download the full example code.
Exploring learner activity sequences#
Scenario: You have interaction logs from a Moodle LMS and want to understand how learners engage with course material.
Concepts covered:
- Load an event log with access()
- Detect learning sessions from inactivity gaps
- Build a StateSequencePool with build_states()
- Filter sequences by length with LengthCriterion
- Visualise action distributions, timelines, and state distributions
Imports#
import random
import pandas as pd
import polars as pl
from tanat import build_states
from tanat.criterion import LengthCriterion
from tanat.dataset import access
from tanat.visualization import SequenceVisualizer
Load and prepare the event log#
access() returns the MOOC dataset as a pandas
DataFrame. Each row is a single learner interaction recorded by a Moodle
LMS (~96 k events, 130 learners).
# Load the MOOC event log as a pandas DataFrame (one row per interaction).
df = access("mooc_events")
# Sanity check: total number of events and number of distinct learners.
print(f"{len(df)} events · {df['user'].nunique()} learners")
# Preview the first rows.
df.head()
mooc_events.csv: 0%| | 0.00/14.6M [00:00<?, ?B/s]
mooc_events.csv: 1%| | 90.1k/14.6M [00:00<00:19, 726kB/s]
mooc_events.csv: 1%| | 180k/14.6M [00:00<00:23, 611kB/s]
mooc_events.csv: 2%|▏ | 303k/14.6M [00:00<00:18, 768kB/s]
mooc_events.csv: 3%|▎ | 434k/14.6M [00:00<00:18, 784kB/s]
mooc_events.csv: 4%|▍ | 549k/14.6M [00:00<00:16, 826kB/s]
mooc_events.csv: 5%|▍ | 713k/14.6M [00:00<00:15, 895kB/s]
mooc_events.csv: 6%|▌ | 827k/14.6M [00:00<00:15, 900kB/s]
mooc_events.csv: 6%|▋ | 926k/14.6M [00:01<00:17, 797kB/s]
mooc_events.csv: 7%|▋ | 1.07M/14.6M [00:01<00:17, 785kB/s]
mooc_events.csv: 9%|▉ | 1.29M/14.6M [00:01<00:14, 896kB/s]
mooc_events.csv: 10%|▉ | 1.45M/14.6M [00:01<00:14, 882kB/s]
mooc_events.csv: 11%|█▏ | 1.66M/14.6M [00:01<00:13, 955kB/s]
mooc_events.csv: 13%|█▎ | 1.86M/14.6M [00:02<00:13, 975kB/s]
mooc_events.csv: 14%|█▍ | 2.06M/14.6M [00:02<00:12, 990kB/s]
mooc_events.csv: 16%|█▌ | 2.29M/14.6M [00:02<00:11, 1.05MB/s]
mooc_events.csv: 17%|█▋ | 2.50M/14.6M [00:02<00:11, 1.07MB/s]
mooc_events.csv: 18%|█▊ | 2.61M/14.6M [00:02<00:11, 1.08MB/s]
mooc_events.csv: 19%|█▉ | 2.78M/14.6M [00:02<00:11, 1.00MB/s]
mooc_events.csv: 20%|██ | 2.97M/14.6M [00:03<00:11, 997kB/s]
mooc_events.csv: 21%|██▏ | 3.13M/14.6M [00:03<00:12, 951kB/s]
mooc_events.csv: 23%|██▎ | 3.34M/14.6M [00:03<00:11, 999kB/s]
mooc_events.csv: 24%|██▍ | 3.56M/14.6M [00:03<00:10, 1.03MB/s]
mooc_events.csv: 26%|██▌ | 3.74M/14.6M [00:03<00:10, 1.00MB/s]
mooc_events.csv: 27%|██▋ | 3.95M/14.6M [00:04<00:10, 1.03MB/s]
mooc_events.csv: 28%|██▊ | 4.15M/14.6M [00:04<00:10, 1.03MB/s]
mooc_events.csv: 30%|██▉ | 4.37M/14.6M [00:04<00:09, 1.08MB/s]
mooc_events.csv: 31%|███ | 4.52M/14.6M [00:04<00:10, 982kB/s]
mooc_events.csv: 32%|███▏ | 4.72M/14.6M [00:04<00:09, 993kB/s]
mooc_events.csv: 34%|███▎ | 4.92M/14.6M [00:05<00:09, 1.00MB/s]
mooc_events.csv: 35%|███▍ | 5.10M/14.6M [00:05<00:09, 982kB/s]
mooc_events.csv: 36%|███▌ | 5.28M/14.6M [00:05<00:09, 968kB/s]
mooc_events.csv: 37%|███▋ | 5.46M/14.6M [00:05<00:09, 958kB/s]
mooc_events.csv: 39%|███▊ | 5.62M/14.6M [00:05<00:09, 926kB/s]
mooc_events.csv: 40%|███▉ | 5.80M/14.6M [00:06<00:09, 929kB/s]
mooc_events.csv: 41%|████ | 5.98M/14.6M [00:06<00:09, 931kB/s]
mooc_events.csv: 42%|████▏ | 6.16M/14.6M [00:06<00:09, 931kB/s]
mooc_events.csv: 44%|████▍ | 6.39M/14.6M [00:06<00:08, 1.01MB/s]
mooc_events.csv: 45%|████▌ | 6.57M/14.6M [00:06<00:08, 984kB/s]
mooc_events.csv: 46%|████▌ | 6.73M/14.6M [00:07<00:08, 943kB/s]
mooc_events.csv: 47%|████▋ | 6.88M/14.6M [00:07<00:08, 891kB/s]
mooc_events.csv: 49%|████▊ | 7.08M/14.6M [00:07<00:08, 931kB/s]
mooc_events.csv: 50%|████▉ | 7.27M/14.6M [00:07<00:07, 944kB/s]
mooc_events.csv: 51%|█████▏ | 7.48M/14.6M [00:07<00:07, 992kB/s]
mooc_events.csv: 53%|█████▎ | 7.71M/14.6M [00:07<00:06, 1.05MB/s]
mooc_events.csv: 54%|█████▎ | 7.82M/14.6M [00:08<00:06, 1.05MB/s]
mooc_events.csv: 55%|█████▍ | 7.95M/14.6M [00:08<00:07, 942kB/s]
mooc_events.csv: 56%|█████▌ | 8.18M/14.6M [00:08<00:06, 1.02MB/s]
mooc_events.csv: 57%|█████▋ | 8.38M/14.6M [00:08<00:06, 1.02MB/s]
mooc_events.csv: 59%|█████▉ | 8.59M/14.6M [00:08<00:05, 1.05MB/s]
mooc_events.csv: 60%|██████ | 8.76M/14.6M [00:09<00:05, 985kB/s]
mooc_events.csv: 62%|██████▏ | 8.99M/14.6M [00:09<00:05, 1.05MB/s]
mooc_events.csv: 63%|██████▎ | 9.18M/14.6M [00:09<00:05, 1.04MB/s]
mooc_events.csv: 64%|██████▍ | 9.35M/14.6M [00:09<00:05, 983kB/s]
mooc_events.csv: 66%|██████▌ | 9.56M/14.6M [00:09<00:04, 1.02MB/s]
mooc_events.csv: 67%|██████▋ | 9.77M/14.6M [00:10<00:04, 1.05MB/s]
mooc_events.csv: 68%|██████▊ | 9.94M/14.6M [00:10<00:04, 987kB/s]
mooc_events.csv: 69%|██████▉ | 10.1M/14.6M [00:10<00:04, 998kB/s]
mooc_events.csv: 71%|███████ | 10.3M/14.6M [00:10<00:04, 953kB/s]
mooc_events.csv: 72%|███████▏ | 10.5M/14.6M [00:10<00:04, 1.00MB/s]
mooc_events.csv: 73%|███████▎ | 10.7M/14.6M [00:10<00:03, 1.18MB/s]
mooc_events.csv: 75%|███████▍ | 10.9M/14.6M [00:11<00:03, 1.07MB/s]
mooc_events.csv: 76%|███████▌ | 11.1M/14.6M [00:11<00:03, 1.02MB/s]
mooc_events.csv: 77%|███████▋ | 11.2M/14.6M [00:11<00:03, 994kB/s]
mooc_events.csv: 78%|███████▊ | 11.4M/14.6M [00:11<00:03, 933kB/s]
mooc_events.csv: 79%|███████▉ | 11.6M/14.6M [00:11<00:03, 934kB/s]
mooc_events.csv: 81%|████████ | 11.7M/14.6M [00:12<00:03, 933kB/s]
mooc_events.csv: 82%|████████▏ | 11.9M/14.6M [00:12<00:02, 907kB/s]
mooc_events.csv: 83%|████████▎ | 12.1M/14.6M [00:12<00:02, 966kB/s]
mooc_events.csv: 85%|████████▍ | 12.4M/14.6M [00:12<00:02, 1.08MB/s]
mooc_events.csv: 86%|████████▋ | 12.6M/14.6M [00:12<00:01, 1.06MB/s]
mooc_events.csv: 88%|████████▊ | 12.8M/14.6M [00:13<00:01, 1.03MB/s]
mooc_events.csv: 89%|████████▊ | 12.9M/14.6M [00:13<00:01, 948kB/s]
mooc_events.csv: 90%|████████▉ | 13.1M/14.6M [00:13<00:01, 969kB/s]
mooc_events.csv: 92%|█████████▏| 13.4M/14.6M [00:13<00:01, 1.06MB/s]
mooc_events.csv: 93%|█████████▎| 13.5M/14.6M [00:13<00:01, 1.02MB/s]
mooc_events.csv: 94%|█████████▍| 13.7M/14.6M [00:13<00:00, 998kB/s]
mooc_events.csv: 95%|█████████▌| 13.9M/14.6M [00:14<00:00, 978kB/s]
mooc_events.csv: 97%|█████████▋| 14.1M/14.6M [00:14<00:00, 986kB/s]
mooc_events.csv: 98%|█████████▊| 14.3M/14.6M [00:14<00:00, 1.02MB/s]
mooc_events.csv: 99%|█████████▉| 14.5M/14.6M [00:14<00:00, 1.02MB/s]
mooc_events.csv: 100%|██████████| 14.6M/14.6M [00:14<00:00, 988kB/s]
95626 events · 130 learners
Step 1: Session detection#
Learning sessions are not labelled in the log. We define a session as a continuous period of activity: a new session begins when the same learner is idle for more than 2 hours, or when a different user appears.
Each session receives a unique integer id that will serve as the sequence identifier in TanaT.
# Idle time after which a learner's next event opens a new session.
INACTIVITY = pd.Timedelta("2h")

# Parse timestamps and order events per learner, oldest first, so that
# consecutive rows can be compared directly.
df["timecreated"] = pd.to_datetime(df["timecreated"])
df = df.sort_values(["user", "timecreated"])

# A row starts a new session when the learner differs from the previous row
# or when the gap to the previous event exceeds INACTIVITY.
new_user = df["user"].ne(df["user"].shift())
long_gap = df["timecreated"].diff().gt(INACTIVITY)

# Running count of session starts = integer session id for every event.
df["session"] = (new_user | long_gap).cumsum()
print(f"Detected {df['session'].nunique()} sessions")
Detected 5700 sessions
Static table: one row per session with the learner identifier.
# Collapse the event log to its distinct (user, session) pairs; this table is
# passed to build_states() as static data further down.
sessions = df[["user", "session"]].drop_duplicates()
Step 2: Build the sequence pool#
Each session becomes one sequence. We use
build_states() with a within-session position index
as the time axis (0 = first event, 1 = second, …). This abstracts away
calendar time and focuses on the order of actions.
The sessions table (one row per session) is passed as static_data
so the learner identifier is attached to each sequence.
# Number the events of each session 0, 1, 2, … following the current row order.
df["position"] = df.groupby("session").cumcount()

# Only the sequence id, the position index and the action label are passed on.
events = df[["session", "position", "Action"]]
pool = build_states(
    events,
    id_column="session",
    start_column="position",
    static_data=sessions,
    store_name="mooc_sessions_store",
)

# ``pl.Categorical`` enables consistent colour-coding across visualisations
# and is required by the metric module.
pool.cast_features({"Action": pl.Categorical}, is_static=False)
┌─ State SequenceStore
│
│ Step 1/4: Sorting & preparing data
│
│ Step 2/4: Building sequence index
│
│ Step 3/4: Writing entity, time index & static features
│
│ Step 4/4: Computing & writing metadata
│
└─ Done (5,700 sequences · 95,626 entities · 0.03s)
print(pool)
┌────────────────────────────────────────────────┐
│ StateSequencePool Summary │
└────────────────────────────────────────────────┘
Overview
─────────────────────────
Sequences 5,700
Store /home/runner/.tanat_workspace/building_pools_tutorial/mooc_sessions_store
id_column session
Time Index
─────────────────────────
Type Int64 (Timestep) [0 → 152]
Columns ['position', 'end']
t0 position=0, anchor=start
Entity Features (1)
─────────────────────────
• Action Categorical (12 categories)
Static Features (1)
─────────────────────────
• user String [len 9 → 9]
Step 3: Filter by length#
The session length distribution is skewed: some outlier sessions contain more than 150 events. We keep sessions with 2 to 40 actions, which covers the large majority of sessions (about 90 %) while removing single-click noise and unrealistically long sessions.
# Keep sequences whose length lies between 2 and 40 (bounds as given to the
# criterion).
length_filter = LengthCriterion(ge=2, le=40)
ids_keep = pool.which(length_filter)
pool_filtered = pool.subset(ids_keep)
[which] LengthCriterion → 5,150 / 5,700 IDs (90.4%)
print(pool_filtered)
┌────────────────────────────────────────────────┐
│ StateSequencePool Summary │
└────────────────────────────────────────────────┘
Overview
─────────────────────────
Sequences 5,150
Store /home/runner/.tanat_workspace/building_pools_tutorial/mooc_sessions_store
id_column session
Time Index
─────────────────────────
Type Int64 (Timestep) [0 → 39]
Columns ['position', 'end']
t0 position=0, anchor=start
Entity Features (1)
─────────────────────────
• Action Categorical (12 categories)
Static Features (1)
─────────────────────────
• user String [len 9 → 9]
Step 4: Action distribution#
A bar plot shows the frequency of each action type across all sessions, giving a first overview of what learners do most.
# Bar plot of the "Action" feature over the filtered pool, sorted descending.
(
    SequenceVisualizer.barplot(sort="descending")
    .title("Action type distribution")
    .draw(pool_filtered, entity_feature="Action")
    .show()
)

Step 5: Sample timeline#
We draw 30 random sessions side by side. Each row is one session; each coloured block is one action at a given position.
# Fixed seed so that the same sessions are drawn on every run.
random.seed(42)

# random.sample needs a sequence, so the ids are sorted into a list first.
candidate_ids = sorted(pool_filtered.unique_ids)
sample_ids = random.sample(candidate_ids, 30)
sample = pool_filtered.subset(sample_ids)

# Timeline of the "Action" feature for the sampled sessions.
(
    SequenceVisualizer.timeline()
    .title("30 random learning sessions")
    .x_axis(label="Position in session")
    .draw(sample, entity_feature="Action")
    .show()
)

/opt/hostedtoolcache/Python/3.13.13/x64/lib/python3.13/site-packages/tanat_utils/caching/cachable.py:133: UserWarning: 30 row(s) have a null time index (__END__) and will be excluded from the visualisation.
value = method(self, *args, **kwargs)
Step 6: State distribution over position#
The distribution plot shows how action proportions shift across positions, revealing how learners typically start and end their sessions.
# Distribution of the "Action" feature per position, one bin per position.
# NOTE(review): the captured run warns that rows with a null time index
# (__END__) are excluded from the plot, one per sequence; confirm whether this
# drops the final action of every session.
(
    SequenceVisualizer.distribution(bin_size=1)
    .title("Action distribution over session progress")
    .x_axis(label="Position in session")
    .draw(pool_filtered, entity_feature="Action")
    .show()
)

/opt/hostedtoolcache/Python/3.13.13/x64/lib/python3.13/site-packages/tanat_utils/caching/cachable.py:133: UserWarning: 5150 row(s) have a null time index (__END__) and will be excluded from the visualisation.
value = method(self, *args, **kwargs)
Total running time of the script: (0 minutes 16.813 seconds)