Spaces:
Sleeping
Sleeping
Sync from GitHub via hub-sync
Browse files
- filter_engine.py +98 -76
- fake_dataset.csv → toy_dataset.csv +0 -0
- uv.lock +0 -0
filter_engine.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import marimo
|
| 2 |
|
| 3 |
-
__generated_with = "0.
|
| 4 |
app = marimo.App(
|
| 5 |
width="medium",
|
| 6 |
app_title="Open Syndrome Definition - Data Browser",
|
|
@@ -18,7 +18,7 @@ def _():
|
|
| 18 |
|
| 19 |
import yaml
|
| 20 |
from opensyndrome.filter import OSDEngine, load_profile
|
| 21 |
-
from opensyndrome.artifacts import
|
| 22 |
|
| 23 |
return (
|
| 24 |
OSDEngine,
|
|
@@ -35,13 +35,19 @@ def _():
|
|
| 35 |
|
| 36 |
@app.cell
|
| 37 |
def _(go, pl):
|
| 38 |
-
def plot_cases(
|
|
|
|
|
|
|
| 39 |
_definitions_columns_sum = [
|
| 40 |
pl.col(definition).sum().alias(definition) for definition in definitions
|
| 41 |
]
|
| 42 |
_agg_df = (
|
| 43 |
_df_filtered.with_columns(
|
| 44 |
-
pl.col(date_column)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
)
|
| 46 |
.group_by("_month")
|
| 47 |
.agg(_definitions_columns_sum)
|
|
@@ -94,9 +100,7 @@ def _(go, pl):
|
|
| 94 |
|
| 95 |
@app.cell
|
| 96 |
def _(mo):
|
| 97 |
-
mo.md(r"""
|
| 98 |
-
# Open Syndrome Definition 👩🏽‍🔬
|
| 99 |
-
""")
|
| 100 |
return
|
| 101 |
|
| 102 |
|
|
@@ -115,8 +119,8 @@ def _(mo):
|
|
| 115 |
@app.cell
|
| 116 |
def _(Path):
|
| 117 |
EXAMPLE_DATASETS = {
|
| 118 |
-
"
|
| 119 |
-
"csv": Path("
|
| 120 |
"mapping": Path("mapping.yaml"),
|
| 121 |
"date_column": "recording_ts",
|
| 122 |
},
|
|
@@ -160,11 +164,7 @@ def _(EXAMPLE_DATASETS, data_source, example_picker, pl, sample_file):
|
|
| 160 |
else None
|
| 161 |
)
|
| 162 |
else:
|
| 163 |
-
df_selected = (
|
| 164 |
-
pl.read_csv(sample_file.contents())
|
| 165 |
-
if sample_file.value
|
| 166 |
-
else None
|
| 167 |
-
)
|
| 168 |
return (df_selected,)
|
| 169 |
|
| 170 |
|
|
@@ -223,14 +223,18 @@ def _(df_selected, initial_date_column, initial_yaml, mo):
|
|
| 223 |
|
| 224 |
mo.vstack(
|
| 225 |
[
|
| 226 |
-
mo.md("###
|
| 227 |
mo.md(
|
| 228 |
"Edit the YAML below to map your dataset columns to OSD concepts, "
|
| 229 |
"then click **Submit**. "
|
| 230 |
"Select the date column separately for the time-series view.\n\n"
|
| 231 |
f"Your dataset columns: `{_cols_hint}`"
|
| 232 |
),
|
| 233 |
-
mo.hstack(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
]
|
| 235 |
)
|
| 236 |
return date_column_picker, date_format_input, yaml_editor
|
|
@@ -246,7 +250,10 @@ def _(df_selected, load_profile, mo, yaml, yaml_editor):
|
|
| 246 |
mo.stop(True, mo.callout(mo.md(f"**Invalid YAML:** {_e}"), kind="danger"))
|
| 247 |
|
| 248 |
if not _parsed["profiles"][0]["columns"]:
|
| 249 |
-
mo.stop(
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
not_found = []
|
| 252 |
for declared_column in _parsed["profiles"][0]["columns"]:
|
|
@@ -254,7 +261,12 @@ def _(df_selected, load_profile, mo, yaml, yaml_editor):
|
|
| 254 |
not_found.append(declared_column)
|
| 255 |
|
| 256 |
if not_found:
|
| 257 |
-
mo.stop(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
| 259 |
try:
|
| 260 |
_profile_name = _parsed["profiles"][0]["name"]
|
|
@@ -282,15 +294,6 @@ def _(get_definition_dir):
|
|
| 282 |
return (definition_options,)
|
| 283 |
|
| 284 |
|
| 285 |
-
@app.cell
|
| 286 |
-
def _(definition_options, json):
|
| 287 |
-
def load_definition(name: str) -> dict:
|
| 288 |
-
letter_dir = name[0].lower()
|
| 289 |
-
return json.loads(definition_options[name].read_text())
|
| 290 |
-
|
| 291 |
-
return (load_definition,)
|
| 292 |
-
|
| 293 |
-
|
| 294 |
@app.cell
|
| 295 |
def _(definition_options, mo):
|
| 296 |
definitions_dropdown = mo.ui.multiselect(
|
|
@@ -299,6 +302,30 @@ def _(definition_options, mo):
|
|
| 299 |
return (definitions_dropdown,)
|
| 300 |
|
| 301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
@app.cell
|
| 303 |
def _(definitions_dropdown, mo):
|
| 304 |
mo.hstack([mo.md("**::lucide:filter:: Filters:**"), definitions_dropdown])
|
|
@@ -316,9 +343,7 @@ def _(
|
|
| 316 |
profile,
|
| 317 |
):
|
| 318 |
mo.stop(
|
| 319 |
-
df_selected is None
|
| 320 |
-
or df_selected.is_empty()
|
| 321 |
-
or not definitions_dropdown.value
|
| 322 |
)
|
| 323 |
|
| 324 |
definitions = definitions_dropdown.value
|
|
@@ -327,8 +352,7 @@ def _(
|
|
| 327 |
engine = OSDEngine(profile, skip_unresolvable=True)
|
| 328 |
|
| 329 |
defs_dict = {
|
| 330 |
-
name: json.loads(definition_options[name].read_text())
|
| 331 |
-
for name in definitions
|
| 332 |
}
|
| 333 |
df_filtered = engine.label(df_selected, defs_dict)
|
| 334 |
return definitions, df_filtered
|
|
@@ -340,55 +364,31 @@ def _(definitions, df_filtered, df_selected, mo):
|
|
| 340 |
|
| 341 |
_cards = [
|
| 342 |
mo.stat(
|
| 343 |
-
label="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
value=df_selected.shape[0],
|
| 345 |
),
|
| 346 |
mo.stat(
|
| 347 |
label="Columns",
|
| 348 |
value=df_selected.shape[1],
|
| 349 |
),
|
| 350 |
-
mo.stat(
|
| 351 |
-
label="Syndromic Indicators",
|
| 352 |
-
value=len(definitions),
|
| 353 |
-
caption=", ".join([definition for definition in definitions]),
|
| 354 |
-
bordered=True,
|
| 355 |
-
),
|
| 356 |
]
|
| 357 |
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
mo.vstack(
|
| 361 |
-
[
|
| 362 |
-
mo.md(_title),
|
| 363 |
-
mo.hstack(_cards, widths="equal", align="center"),
|
| 364 |
-
]
|
| 365 |
-
)
|
| 366 |
return
|
| 367 |
|
| 368 |
|
| 369 |
@app.cell
|
| 370 |
-
def _(
|
| 371 |
-
|
|
|
|
| 372 |
|
| 373 |
-
|
| 374 |
-
[
|
| 375 |
-
mo.md("### Definitions details"),
|
| 376 |
-
mo.md(
|
| 377 |
-
"This section shows the definitions used to filter the data. You can use them to understand how the data was filtered and what criteria were applied. π"
|
| 378 |
-
),
|
| 379 |
-
mo.ui.tabs(
|
| 380 |
-
{
|
| 381 |
-
"JSONs": mo.accordion(
|
| 382 |
-
{
|
| 383 |
-
definition: mo.json(load_definition(definition))
|
| 384 |
-
for definition in definitions
|
| 385 |
-
}
|
| 386 |
-
),
|
| 387 |
-
},
|
| 388 |
-
),
|
| 389 |
-
]
|
| 390 |
-
)
|
| 391 |
-
return
|
| 392 |
|
| 393 |
|
| 394 |
@app.cell
|
|
@@ -414,7 +414,7 @@ def _(
|
|
| 414 |
|
| 415 |
if code_column:
|
| 416 |
diagnosis_chart = [
|
| 417 |
-
mo.md("## Codes comparison per syndromic indicator"),
|
| 418 |
top_n.left(),
|
| 419 |
groupped_bar(
|
| 420 |
df_filtered,
|
|
@@ -426,24 +426,46 @@ def _(
|
|
| 426 |
else:
|
| 427 |
diagnosis_chart = []
|
| 428 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
mo.vstack(
|
| 430 |
[
|
| 431 |
-
mo.md("##
|
| 432 |
-
|
| 433 |
-
*diagnosis_chart
|
| 434 |
]
|
| 435 |
)
|
| 436 |
return
|
| 437 |
|
| 438 |
|
| 439 |
@app.cell
|
| 440 |
-
def _(definitions,
|
| 441 |
-
mo.stop(
|
| 442 |
|
| 443 |
mo.vstack(
|
| 444 |
[
|
| 445 |
-
mo.md("###
|
| 446 |
-
mo.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
]
|
| 448 |
)
|
| 449 |
return
|
|
|
|
| 1 |
import marimo
|
| 2 |
|
| 3 |
+
__generated_with = "0.14.16"
|
| 4 |
app = marimo.App(
|
| 5 |
width="medium",
|
| 6 |
app_title="Open Syndrome Definition - Data Browser",
|
|
|
|
| 18 |
|
| 19 |
import yaml
|
| 20 |
from opensyndrome.filter import OSDEngine, load_profile
|
| 21 |
+
from opensyndrome.artifacts import get_definition_dir
|
| 22 |
|
| 23 |
return (
|
| 24 |
OSDEngine,
|
|
|
|
| 35 |
|
| 36 |
@app.cell
|
| 37 |
def _(go, pl):
|
| 38 |
+
def plot_cases(
|
| 39 |
+
_df_filtered, definitions, date_column="date", date_format="%Y-%m-%d %H:%M:%S"
|
| 40 |
+
):
|
| 41 |
_definitions_columns_sum = [
|
| 42 |
pl.col(definition).sum().alias(definition) for definition in definitions
|
| 43 |
]
|
| 44 |
_agg_df = (
|
| 45 |
_df_filtered.with_columns(
|
| 46 |
+
pl.col(date_column)
|
| 47 |
+
.str.to_datetime(format=date_format, strict=False)
|
| 48 |
+
.cast(pl.Date)
|
| 49 |
+
.dt.truncate("1mo")
|
| 50 |
+
.alias("_month")
|
| 51 |
)
|
| 52 |
.group_by("_month")
|
| 53 |
.agg(_definitions_columns_sum)
|
|
|
|
| 100 |
|
| 101 |
@app.cell
|
| 102 |
def _(mo):
|
| 103 |
+
mo.md(r"""# Open Syndrome Definition 👩🏽‍🔬""")
|
|
|
|
|
|
|
| 104 |
return
|
| 105 |
|
| 106 |
|
|
|
|
| 119 |
@app.cell
|
| 120 |
def _(Path):
|
| 121 |
EXAMPLE_DATASETS = {
|
| 122 |
+
"Toy dataset": {
|
| 123 |
+
"csv": Path("toy_dataset.csv"),
|
| 124 |
"mapping": Path("mapping.yaml"),
|
| 125 |
"date_column": "recording_ts",
|
| 126 |
},
|
|
|
|
| 164 |
else None
|
| 165 |
)
|
| 166 |
else:
|
| 167 |
+
df_selected = pl.read_csv(sample_file.contents()) if sample_file.value else None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
return (df_selected,)
|
| 169 |
|
| 170 |
|
|
|
|
| 223 |
|
| 224 |
mo.vstack(
|
| 225 |
[
|
| 226 |
+
mo.md("### Mapping your data to the format"),
|
| 227 |
mo.md(
|
| 228 |
"Edit the YAML below to map your dataset columns to OSD concepts, "
|
| 229 |
"then click **Submit**. "
|
| 230 |
"Select the date column separately for the time-series view.\n\n"
|
| 231 |
f"Your dataset columns: `{_cols_hint}`"
|
| 232 |
),
|
| 233 |
+
mo.hstack(
|
| 234 |
+
[yaml_editor, mo.vstack([date_column_picker, date_format_input])],
|
| 235 |
+
widths=[3, 1],
|
| 236 |
+
align="start",
|
| 237 |
+
),
|
| 238 |
]
|
| 239 |
)
|
| 240 |
return date_column_picker, date_format_input, yaml_editor
|
|
|
|
| 250 |
mo.stop(True, mo.callout(mo.md(f"**Invalid YAML:** {_e}"), kind="danger"))
|
| 251 |
|
| 252 |
if not _parsed["profiles"][0]["columns"]:
|
| 253 |
+
mo.stop(
|
| 254 |
+
True,
|
| 255 |
+
mo.callout(mo.md("You need to map **at least one column**"), kind="danger"),
|
| 256 |
+
)
|
| 257 |
|
| 258 |
not_found = []
|
| 259 |
for declared_column in _parsed["profiles"][0]["columns"]:
|
|
|
|
| 261 |
not_found.append(declared_column)
|
| 262 |
|
| 263 |
if not_found:
|
| 264 |
+
mo.stop(
|
| 265 |
+
True,
|
| 266 |
+
mo.callout(
|
| 267 |
+
mo.md(f"**Columns not found:** {', '.join(not_found)}"), kind="danger"
|
| 268 |
+
),
|
| 269 |
+
)
|
| 270 |
|
| 271 |
try:
|
| 272 |
_profile_name = _parsed["profiles"][0]["name"]
|
|
|
|
| 294 |
return (definition_options,)
|
| 295 |
|
| 296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
@app.cell
|
| 298 |
def _(definition_options, mo):
|
| 299 |
definitions_dropdown = mo.ui.multiselect(
|
|
|
|
| 302 |
return (definitions_dropdown,)
|
| 303 |
|
| 304 |
|
| 305 |
+
@app.cell
|
| 306 |
+
def _(mo):
|
| 307 |
+
mo.md(r"""### Data sample""")
|
| 308 |
+
return
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
@app.cell
|
| 312 |
+
def _(df_selected):
|
| 313 |
+
df_selected.sample(10)
|
| 314 |
+
return
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
@app.cell
|
| 318 |
+
def _(mo):
|
| 319 |
+
mo.md(r"""---""")
|
| 320 |
+
return
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
@app.cell
|
| 324 |
+
def _(mo):
|
| 325 |
+
mo.md(r"""## Data & Definitions""")
|
| 326 |
+
return
|
| 327 |
+
|
| 328 |
+
|
| 329 |
@app.cell
|
| 330 |
def _(definitions_dropdown, mo):
|
| 331 |
mo.hstack([mo.md("**::lucide:filter:: Filters:**"), definitions_dropdown])
|
|
|
|
| 343 |
profile,
|
| 344 |
):
|
| 345 |
mo.stop(
|
| 346 |
+
df_selected is None or df_selected.is_empty() or not definitions_dropdown.value
|
|
|
|
|
|
|
| 347 |
)
|
| 348 |
|
| 349 |
definitions = definitions_dropdown.value
|
|
|
|
| 352 |
engine = OSDEngine(profile, skip_unresolvable=True)
|
| 353 |
|
| 354 |
defs_dict = {
|
| 355 |
+
name: json.loads(definition_options[name].read_text()) for name in definitions
|
|
|
|
| 356 |
}
|
| 357 |
df_filtered = engine.label(df_selected, defs_dict)
|
| 358 |
return definitions, df_filtered
|
|
|
|
| 364 |
|
| 365 |
_cards = [
|
| 366 |
mo.stat(
|
| 367 |
+
label="Syndromic Indicators",
|
| 368 |
+
value=len(definitions),
|
| 369 |
+
caption=", ".join([definition for definition in definitions]),
|
| 370 |
+
bordered=True,
|
| 371 |
+
),
|
| 372 |
+
mo.stat(
|
| 373 |
+
label="Rows",
|
| 374 |
value=df_selected.shape[0],
|
| 375 |
),
|
| 376 |
mo.stat(
|
| 377 |
label="Columns",
|
| 378 |
value=df_selected.shape[1],
|
| 379 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
]
|
| 381 |
|
| 382 |
+
mo.hstack(_cards, widths="equal", align="center")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
return
|
| 384 |
|
| 385 |
|
| 386 |
@app.cell
|
| 387 |
+
def _(definition_options, json):
|
| 388 |
+
def load_definition(name: str) -> dict:
|
| 389 |
+
return json.loads(definition_options[name].read_text())
|
| 390 |
|
| 391 |
+
return (load_definition,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
|
| 393 |
|
| 394 |
@app.cell
|
|
|
|
| 414 |
|
| 415 |
if code_column:
|
| 416 |
diagnosis_chart = [
|
| 417 |
+
mo.md("### Codes comparison per syndromic indicator"),
|
| 418 |
top_n.left(),
|
| 419 |
groupped_bar(
|
| 420 |
df_filtered,
|
|
|
|
| 426 |
else:
|
| 427 |
diagnosis_chart = []
|
| 428 |
|
| 429 |
+
timeseries = [
|
| 430 |
+
mo.md("### Time series"),
|
| 431 |
+
plot_cases(
|
| 432 |
+
df_filtered,
|
| 433 |
+
definitions,
|
| 434 |
+
date_column=date_column,
|
| 435 |
+
date_format=date_format_input.value,
|
| 436 |
+
),
|
| 437 |
+
]
|
| 438 |
+
|
| 439 |
mo.vstack(
|
| 440 |
[
|
| 441 |
+
mo.md("## Findings from the data π"),
|
| 442 |
+
*timeseries,
|
| 443 |
+
*diagnosis_chart,
|
| 444 |
]
|
| 445 |
)
|
| 446 |
return
|
| 447 |
|
| 448 |
|
| 449 |
@app.cell
|
| 450 |
+
def _(definitions, load_definition, mo):
|
| 451 |
+
mo.stop(definitions is None)
|
| 452 |
|
| 453 |
mo.vstack(
|
| 454 |
[
|
| 455 |
+
mo.md("### Definitions details"),
|
| 456 |
+
mo.md(
|
| 457 |
+
"Here the definitions used to filter the data. See here what criteria were applied. π"
|
| 458 |
+
),
|
| 459 |
+
mo.accordion(
|
| 460 |
+
{
|
| 461 |
+
"JSONs": mo.accordion(
|
| 462 |
+
{
|
| 463 |
+
definition: mo.json(load_definition(definition))
|
| 464 |
+
for definition in definitions
|
| 465 |
+
}
|
| 466 |
+
),
|
| 467 |
+
},
|
| 468 |
+
),
|
| 469 |
]
|
| 470 |
)
|
| 471 |
return
|
fake_dataset.csv → toy_dataset.csv
RENAMED
|
File without changes
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|