#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2026-01-29 22:30:00 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-dataset/src/scitex_dataset/__init__.py
"""
SciTeX Dataset - Unified interface for scientific dataset discovery.
Domains:
- neuroscience: OpenNeuro, DANDI, PhysioNet
- general: Scientific Data, Zenodo
- biology: GEO (Gene Expression Omnibus)
- pharmacology: ChEMBL
- medical: ClinicalTrials.gov
Usage:
>>> from scitex_dataset import neuroscience
>>> datasets = neuroscience.fetch_all_datasets(max_datasets=10)
>>> # Or direct import for convenience
>>> from scitex_dataset import fetch_all_datasets, search_datasets
>>> # Local database for fast searching
>>> from scitex_dataset import database as db
>>> db.build() # Fetch all sources and index
>>> results = db.search("alzheimer EEG", min_subjects=20)
"""
from __future__ import annotations
# Resolve the installed distribution's version; fall back to a local
# placeholder when the package metadata cannot be found.
try:
    import importlib.metadata as _metadata

    try:
        __version__ = _metadata.version("scitex-dataset")
    except _metadata.PackageNotFoundError:
        # No distribution metadata for "scitex-dataset" was found.
        __version__ = "0.0.0+local"
    # Drop the temporary alias so it does not linger in the package namespace.
    del _metadata
except ImportError:  # pragma: no cover — only on ancient Pythons
    __version__ = "0.0.0+local"
# Domain submodules
from . import _api as _api # noqa: F401
from . import biology, database, general, medical, neuroscience, pharmacology
# Per-source ``<src>_fetch`` / ``<src>_format`` aliases — give every MCP
# tool a matching Python callable for the audit-mcp-tools § 6 parity
# check. See _api.py for the explicit list.
from ._api import ( # noqa: F401
chembl_fetch,
clinicaltrials_fetch,
dandi_fetch,
figshare_fetch,
geo_fetch,
huggingface_download_file,
huggingface_fetch,
huggingface_info,
huggingface_search,
moleculenet_fetch,
openml_fetch,
openneuro_fetch,
physionet_fetch,
zenodo_fetch,
)
# DB-level aliases for MCP parity (`dataset_db_*` tools).
from .database import build as db_build # noqa: F401
from .database import get_stats as db_show_stats # noqa: F401
from .database import search as db_search # noqa: F401
# Convenience exports from neuroscience.openneuro (primary source)
from .neuroscience.openneuro import (
OPENNEURO_API,
fetch_all_datasets,
fetch_datasets,
format_dataset,
)
from .search import search_datasets, sort_datasets
def list_sources() -> dict:
    """Return the source registry — matches ``dataset_list_sources`` MCP tool.

    The result is a dict with two keys: ``"sources"`` holds the
    ``SOURCE_INFO`` object from ``_sources`` as-is, and ``"count"`` holds
    ``len(SOURCE_INFO)``.
    """
    # Imported at call time rather than at package import.
    from ._sources import SOURCE_INFO

    return {"sources": SOURCE_INFO, "count": len(SOURCE_INFO)}
def filter_results(datasets, **kwargs):
    """Filter and rank dataset dicts — matches ``dataset_filter_results`` MCP tool.

    Two keyword arguments are consumed here and not forwarded:

    - ``sort_by`` (default ``"downloads"``): passed to ``sort_datasets`` as
      ``by``; sorting is always requested with ``descending=True``.
    - ``limit`` (default ``None``): when truthy, the result is sliced to
      ``[:limit]``; a falsy value (``None`` or ``0``) returns everything.

    All remaining keyword arguments are forwarded unchanged to
    ``search_datasets``.
    """
    # Imported at call time rather than relying on the package-level names.
    from .search import search_datasets as _search
    from .search import sort_datasets as _sort

    # ``kwargs`` is this call's own dict, so popping does not affect the caller.
    sort_by = kwargs.pop("sort_by", "downloads")
    limit = kwargs.pop("limit", None)

    matched = _search(datasets, **kwargs)
    ranked = _sort(matched, by=sort_by, descending=True)
    return ranked[:limit] if limit else ranked
# Public Python API surface — kept in lock-step with the MCP tool set
# under `_mcp/_tools/` so `scitex-dev ecosystem audit-mcp-tools` § 6
# parity passes without skip_rules masking.
#
# The domain submodules (``neuroscience``, ``biology``, …) and the
# ``database`` module remain importable via ordinary attribute access
# (``scitex_dataset.biology.fetch_all_datasets``) — they are simply
# excluded from ``__all__`` so the audit doesn't flatten their
# convenience re-exports into a second, MCP-less Python surface. The
# bare convenience aliases ``fetch_*`` / ``format_dataset`` / ``search_*``
# / ``sort_*`` are likewise omitted: they are OpenNeuro-only shortcuts
# that masquerade as domain-level functions and have no matching MCP
# tool; the source-explicit ``openneuro_fetch`` is the supported entry.
__all__ = [
    # Package version string resolved at import time.
    "__version__",
    # Database aliases (MCP parity with the ``dataset_db_*`` tools).
    "db_build",
    "db_search",
    "db_show_stats",
    # Filtering and source listing (MCP parity with the
    # ``dataset_filter_results`` / ``dataset_list_sources`` tools).
    "filter_results",
    "list_sources",
    # Per-source ``<src>_fetch`` aliases (MCP parity with the
    # ``dataset_<src>_fetch`` tools).
    "openneuro_fetch",
    "dandi_fetch",
    "physionet_fetch",
    "zenodo_fetch",
    "figshare_fetch",
    "openml_fetch",
    "moleculenet_fetch",
    "geo_fetch",
    "chembl_fetch",
    "clinicaltrials_fetch",
    # HuggingFace family (fetch, search, info and single-file download).
    "huggingface_fetch",
    "huggingface_search",
    "huggingface_info",
    "huggingface_download_file",
]
# EOF