Source code for scitex_dataset.search

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2026-01-29 22:27:00 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-dataset/src/scitex_dataset/search.py

"""
Unified search interface for neuroscience datasets.

Currently supports:
- OpenNeuro (BIDS neuroimaging)

Future sources:
- DANDI (NWB neurophysiology)
- PhysioNet (EEG/ECG/physiology)
- Zenodo (general scientific)
"""

from __future__ import annotations

from typing import Optional

from scitex_dev.decorators import supports_return_as


[docs] @supports_return_as def search_datasets( datasets: list[dict], modality: Optional[str] = None, min_subjects: Optional[int] = None, max_subjects: Optional[int] = None, task_contains: Optional[str] = None, text_query: Optional[str] = None, min_downloads: Optional[int] = None, has_readme: bool = False, ) -> list[dict]: """Filter datasets by various criteria. Args: datasets: List of formatted dataset dictionaries modality: Filter by modality (e.g., "mri", "eeg", "meg") min_subjects: Minimum number of subjects max_subjects: Maximum number of subjects task_contains: Filter by task name substring text_query: Search in name and readme text min_downloads: Minimum download count has_readme: Only include datasets with readme Returns: Filtered list of datasets Example: >>> from scitex_dataset import fetch_all_datasets, format_dataset >>> from scitex_dataset.search import search_datasets >>> raw = fetch_all_datasets(max_datasets=100) >>> datasets = [format_dataset(d) for d in raw] >>> eeg_data = search_datasets(datasets, modality="eeg", min_subjects=20) """ results = datasets if modality: modality_lower = modality.lower() results = [ d for d in results if modality_lower in [m.lower() for m in (d.get("modalities") or [])] or modality_lower == (d.get("primary_modality") or "").lower() ] if min_subjects is not None: results = [d for d in results if d.get("n_subjects", 0) >= min_subjects] if max_subjects is not None: results = [d for d in results if d.get("n_subjects", 0) <= max_subjects] if task_contains: task_lower = task_contains.lower() results = [ d for d in results if any(task_lower in t.lower() for t in (d.get("tasks") or [])) ] if text_query: query = text_query.lower() results = [ d for d in results if query in (d.get("name") or "").lower() or query in (d.get("readme") or "").lower() ] if min_downloads is not None: results = [d for d in results if (d.get("downloads") or 0) >= min_downloads] if has_readme: results = [d for d in results if d.get("readme")] return results
[docs] @supports_return_as def sort_datasets( datasets: list[dict], by: str = "downloads", descending: bool = True, ) -> list[dict]: """Sort datasets by a field. Args: datasets: List of formatted dataset dictionaries by: Field to sort by (downloads, views, n_subjects, size_gb, created) descending: Sort in descending order Returns: Sorted list of datasets """ def get_key(d: dict): val = d.get(by) if val is None: return 0 if descending else float("inf") return val return sorted(datasets, key=get_key, reverse=descending)
# EOF