From 2e79229042b859a817c4e75f54deb4b62ad294c5 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 12 Jan 2026 00:10:23 -0600 Subject: [PATCH 01/10] feat: Add external storage and filepath migration functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add migrate_external() and migrate_filepath() to datajoint.migrate module for safe migration of 0.x external storage columns to 2.0 JSON format. Migration strategy: 1. Add new _v2 columns with JSON type 2. Copy and convert data from old columns 3. User verifies data accessible via DataJoint 2.0 4. Finalize: rename columns (old → _v1, new → original) This allows 0.x and 2.0 to coexist during migration and provides rollback capability if issues are discovered. Functions: - migrate_external(schema, dry_run=True, finalize=False) - migrate_filepath(schema, dry_run=True, finalize=False) - _find_external_columns(schema) - detect 0.x external columns - _find_filepath_columns(schema) - detect 0.x filepath columns Co-Authored-By: Claude Opus 4.5 --- src/datajoint/migrate.py | 555 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 555 insertions(+) diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index 3a2bf2ce6..b72896d0a 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -8,6 +8,7 @@ from __future__ import annotations +import json import logging import re from typing import TYPE_CHECKING @@ -19,6 +20,14 @@ logger = logging.getLogger(__name__.split(".")[0]) +# Patterns for detecting 0.x external storage columns +EXTERNAL_PATTERNS = { + "blob": re.compile(r":external(?:-([a-zA-Z_][a-zA-Z0-9_]*))?:", re.I), + "attach": re.compile(r":external-attach(?:-([a-zA-Z_][a-zA-Z0-9_]*))?:", re.I), +} + +FILEPATH_PATTERN = re.compile(r":filepath(?:-([a-zA-Z_][a-zA-Z0-9_]*))?:", re.I) + # Pattern to detect blob types BLOB_TYPES = re.compile(r"^(tiny|small|medium|long|)blob$", re.I) @@ -450,3 +459,549 @@ def add_job_metadata_columns(target, dry_run: bool = True) -> dict: result["details"].append(table_detail) return result + + +# ============================================================================= +# External Storage Migration (Phase 6) +# ============================================================================= + + +def _find_external_columns(schema: Schema) -> list[dict]: + """ + Find columns using 0.x external storage format. + + Returns list of dicts with column info and detected store name. 
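+
+    Example entry (illustrative values; the keys mirror the dicts built below)::
+
+        {
+            "table_name": "recording",
+            "column_name": "trace",
+            "column_type": "binary(16)",
+            "comment": ":external-raw: raw voltage trace",
+            "store_name": "raw",
+            "external_type": "blob",
+        }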
+ """ + connection = schema.connection + results = [] + + # Get all tables (excluding hidden tables) + tables_query = """ + SELECT TABLE_NAME + FROM information_schema.TABLES + WHERE TABLE_SCHEMA = %s + AND TABLE_TYPE = 'BASE TABLE' + AND TABLE_NAME NOT LIKE '~%%' + """ + tables = connection.query(tables_query, args=(schema.database,)).fetchall() + + for (table_name,) in tables: + # Find BINARY(16) columns (0.x external storage format) + columns_query = """ + SELECT COLUMN_NAME, COLUMN_TYPE, COLUMN_COMMENT + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s + AND TABLE_NAME = %s + AND DATA_TYPE = 'binary' + AND CHARACTER_MAXIMUM_LENGTH = 16 + """ + columns = connection.query( + columns_query, args=(schema.database, table_name) + ).fetchall() + + for column_name, column_type, comment in columns: + comment = comment or "" + + # Check for external blob pattern + blob_match = EXTERNAL_PATTERNS["blob"].search(comment) + if blob_match: + store_name = blob_match.group(1) or "external" + results.append({ + "table_name": table_name, + "column_name": column_name, + "column_type": column_type, + "comment": comment, + "store_name": store_name, + "external_type": "blob", + }) + continue + + # Check for external attach pattern + attach_match = EXTERNAL_PATTERNS["attach"].search(comment) + if attach_match: + store_name = attach_match.group(1) or "external" + results.append({ + "table_name": table_name, + "column_name": column_name, + "column_type": column_type, + "comment": comment, + "store_name": store_name, + "external_type": "attach", + }) + + return results + + +def _find_filepath_columns(schema: Schema) -> list[dict]: + """ + Find columns using 0.x filepath format. + + Returns list of dicts with column info and detected store name. + """ + connection = schema.connection + results = [] + + # Get all tables (excluding hidden tables) + tables_query = """ + SELECT TABLE_NAME + FROM information_schema.TABLES + WHERE TABLE_SCHEMA = %s + AND TABLE_TYPE = 'BASE TABLE' + AND TABLE_NAME NOT LIKE '~%%' + """ + tables = connection.query(tables_query, args=(schema.database,)).fetchall() + + for (table_name,) in tables: + # Find VARCHAR columns with :filepath: in comment + columns_query = """ + SELECT COLUMN_NAME, COLUMN_TYPE, COLUMN_COMMENT + FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s + AND TABLE_NAME = %s + AND DATA_TYPE = 'varchar' + AND COLUMN_COMMENT LIKE '%%:filepath%%' + """ + columns = connection.query( + columns_query, args=(schema.database, table_name) + ).fetchall() + + for column_name, column_type, comment in columns: + comment = comment or "" + match = FILEPATH_PATTERN.search(comment) + if match: + store_name = match.group(1) or "external" + results.append({ + "table_name": table_name, + "column_name": column_name, + "column_type": column_type, + "comment": comment, + "store_name": store_name, + }) + + return results + + +def migrate_external( + schema: Schema, + dry_run: bool = True, + finalize: bool = False, +) -> dict: + """ + Migrate external storage columns from 0.x to 2.0 format. + + This migration uses a safe, multi-step approach: + + 1. **Initial run** (dry_run=False): Adds new `_v2` columns with JSON + type and copies data from the old columns, converting UUID references to + JSON metadata. + + 2. **Verification**: You verify all data is accessible via DataJoint 2.0. + + 3. **Finalize** (finalize=True): Renames columns (old → `_v1`, new → original + name) and optionally drops the old columns. 
+ + This allows 0.x and 2.0 to coexist during migration and provides a rollback + path if issues are discovered. + + Parameters + ---------- + schema : Schema + The DataJoint schema to migrate. + dry_run : bool, optional + If True, only preview changes without applying. Default True. + finalize : bool, optional + If True, rename migrated columns to original names and drop old columns. + Only run after verifying migration succeeded. Default False. + + Returns + ------- + dict + Migration results with keys: + + - columns_found: Number of external columns found + - columns_migrated: Number of columns processed + - rows_migrated: Number of rows with data converted + - details: Per-column migration details + + Examples + -------- + >>> from datajoint.migration import migrate_external + >>> + >>> # Step 1: Preview + >>> result = migrate_external(schema, dry_run=True) + >>> print(f"Found {result['columns_found']} columns to migrate") + >>> + >>> # Step 2: Run migration (adds new columns) + >>> result = migrate_external(schema, dry_run=False) + >>> print(f"Migrated {result['rows_migrated']} rows") + >>> + >>> # Step 3: Verify data is accessible via DataJoint 2.0 + >>> # ... manual verification ... + >>> + >>> # Step 4: Finalize (rename columns, drop old) + >>> result = migrate_external(schema, finalize=True) + + Notes + ----- + The migration reads from the hidden `~external_` tables to build + JSON metadata. Ensure store configuration in datajoint.json matches the + paths stored in these tables. + """ + columns = _find_external_columns(schema) + connection = schema.connection + database = schema.database + + result = { + "columns_found": len(columns), + "columns_migrated": 0, + "rows_migrated": 0, + "details": [], + } + + if not columns: + logger.info(f"No external columns found in {database}") + return result + + for col in columns: + table_name = col["table_name"] + column_name = col["column_name"] + store_name = col["store_name"] + external_type = col["external_type"] + old_comment = col["comment"] + + detail = { + "table": f"{database}.{table_name}", + "column": column_name, + "store": store_name, + "type": external_type, + "status": "pending", + "rows": 0, + } + + # Build new comment + codec = "blob" if external_type == "blob" else "attach" + # Remove old :external...: pattern from comment + new_comment = EXTERNAL_PATTERNS[external_type].sub("", old_comment).strip() + new_comment = f":{codec}@{store_name}: {new_comment}".strip() + + new_column = f"{column_name}_v2" + + if finalize: + # Finalize: rename columns + detail["action"] = "finalize" + + if dry_run: + logger.info( + f"Would finalize {database}.{table_name}.{column_name}: " + f"rename {column_name} → {column_name}_v1, " + f"{new_column} → {column_name}" + ) + detail["status"] = "dry_run" + else: + try: + # Rename old column to _v1 + sql = ( + f"ALTER TABLE `{database}`.`{table_name}` " + f"CHANGE COLUMN `{column_name}` `{column_name}_v1` " + f"{col['column_type']} COMMENT 'legacy 0.x'" + ) + connection.query(sql) + + # Rename new column to original name + sql = ( + f"ALTER TABLE `{database}`.`{table_name}` " + f"CHANGE COLUMN `{new_column}` `{column_name}` " + f"JSON COMMENT '{new_comment}'" + ) + connection.query(sql) + + detail["status"] = "finalized" + result["columns_migrated"] += 1 + logger.info(f"Finalized {database}.{table_name}.{column_name}") + except Exception as e: + detail["status"] = "error" + detail["error"] = str(e) + logger.error(f"Failed to finalize {table_name}.{column_name}: {e}") + raise DataJointError(f"Finalize 
failed: {e}") from e + else: + # Initial migration: add new column and copy data + detail["action"] = "migrate" + + # Check if _v2 column already exists + existing = connection.query( + """ + SELECT COLUMN_NAME FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s + """, + args=(database, table_name, new_column), + ).fetchone() + + if existing: + detail["status"] = "already_migrated" + logger.info(f"Column {new_column} already exists, skipping") + result["details"].append(detail) + continue + + if dry_run: + # Count rows that would be migrated + count_sql = f""" + SELECT COUNT(*) FROM `{database}`.`{table_name}` + WHERE `{column_name}` IS NOT NULL + """ + count = connection.query(count_sql).fetchone()[0] + detail["rows"] = count + detail["status"] = "dry_run" + logger.info( + f"Would migrate {database}.{table_name}.{column_name}: " + f"{count} rows, store={store_name}" + ) + else: + try: + # Add new JSON column + sql = ( + f"ALTER TABLE `{database}`.`{table_name}` " + f"ADD COLUMN `{new_column}` JSON " + f"COMMENT '{new_comment}'" + ) + connection.query(sql) + + # Copy and convert data from old column + # Query the external table for metadata + external_table = f"~external_{store_name}" + + # Get store config for URL building + from .settings import config + store_config = config.get("stores", {}).get(store_name, {}) + protocol = store_config.get("protocol", "file") + location = store_config.get("location", "") + + # Update rows with JSON metadata + update_sql = f""" + UPDATE `{database}`.`{table_name}` t + JOIN `{database}`.`{external_table}` e + ON t.`{column_name}` = e.hash + SET t.`{new_column}` = JSON_OBJECT( + 'url', CONCAT('{protocol}://', '{location}/', e.filepath), + 'size', e.size, + 'hash', HEX(e.hash) + ) + WHERE t.`{column_name}` IS NOT NULL + """ + connection.query(update_sql) + + # Count migrated rows + count_sql = f""" + SELECT COUNT(*) FROM `{database}`.`{table_name}` + WHERE `{new_column}` IS NOT NULL + """ + count = connection.query(count_sql).fetchone()[0] + detail["rows"] = count + detail["status"] = "migrated" + result["columns_migrated"] += 1 + result["rows_migrated"] += count + + logger.info( + f"Migrated {database}.{table_name}.{column_name}: " + f"{count} rows" + ) + except Exception as e: + detail["status"] = "error" + detail["error"] = str(e) + logger.error( + f"Failed to migrate {table_name}.{column_name}: {e}" + ) + raise DataJointError(f"Migration failed: {e}") from e + + result["details"].append(detail) + + return result + + +def migrate_filepath( + schema: Schema, + dry_run: bool = True, + finalize: bool = False, +) -> dict: + """ + Migrate filepath columns from 0.x to 2.0 format. + + Same multi-step approach as migrate_external: + + 1. **Initial run**: Adds new `_v2` columns with JSON type + 2. **Verification**: Verify files accessible via DataJoint 2.0 + 3. **Finalize**: Rename columns and drop old + + Parameters + ---------- + schema : Schema + The DataJoint schema to migrate. + dry_run : bool, optional + If True, only preview changes. Default True. + finalize : bool, optional + If True, finalize migration. Default False. + + Returns + ------- + dict + Migration results (same format as migrate_external). 
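+        For example (illustrative counts)::
+
+            {"columns_found": 2, "columns_migrated": 2,
+             "rows_migrated": 150, "details": [...]}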
+ + Examples + -------- + >>> from datajoint.migration import migrate_filepath + >>> + >>> # Preview + >>> result = migrate_filepath(schema, dry_run=True) + >>> + >>> # Run migration + >>> result = migrate_filepath(schema, dry_run=False) + >>> + >>> # Finalize after verification + >>> result = migrate_filepath(schema, finalize=True) + """ + columns = _find_filepath_columns(schema) + connection = schema.connection + database = schema.database + + result = { + "columns_found": len(columns), + "columns_migrated": 0, + "rows_migrated": 0, + "details": [], + } + + if not columns: + logger.info(f"No filepath columns found in {database}") + return result + + for col in columns: + table_name = col["table_name"] + column_name = col["column_name"] + store_name = col["store_name"] + old_comment = col["comment"] + + detail = { + "table": f"{database}.{table_name}", + "column": column_name, + "store": store_name, + "status": "pending", + "rows": 0, + } + + # Build new comment + new_comment = FILEPATH_PATTERN.sub("", old_comment).strip() + new_comment = f":filepath@{store_name}: {new_comment}".strip() + + new_column = f"{column_name}_v2" + + if finalize: + detail["action"] = "finalize" + + if dry_run: + logger.info( + f"Would finalize {database}.{table_name}.{column_name}" + ) + detail["status"] = "dry_run" + else: + try: + # Rename old column to _v1 + sql = ( + f"ALTER TABLE `{database}`.`{table_name}` " + f"CHANGE COLUMN `{column_name}` `{column_name}_v1` " + f"{col['column_type']} COMMENT 'legacy 0.x'" + ) + connection.query(sql) + + # Rename new column to original name + sql = ( + f"ALTER TABLE `{database}`.`{table_name}` " + f"CHANGE COLUMN `{new_column}` `{column_name}` " + f"JSON COMMENT '{new_comment}'" + ) + connection.query(sql) + + detail["status"] = "finalized" + result["columns_migrated"] += 1 + logger.info(f"Finalized {database}.{table_name}.{column_name}") + except Exception as e: + detail["status"] = "error" + detail["error"] = str(e) + logger.error(f"Failed to finalize: {e}") + raise DataJointError(f"Finalize failed: {e}") from e + else: + detail["action"] = "migrate" + + # Check if _v2 column already exists + existing = connection.query( + """ + SELECT COLUMN_NAME FROM information_schema.COLUMNS + WHERE TABLE_SCHEMA = %s AND TABLE_NAME = %s AND COLUMN_NAME = %s + """, + args=(database, table_name, new_column), + ).fetchone() + + if existing: + detail["status"] = "already_migrated" + result["details"].append(detail) + continue + + if dry_run: + count_sql = f""" + SELECT COUNT(*) FROM `{database}`.`{table_name}` + WHERE `{column_name}` IS NOT NULL + """ + count = connection.query(count_sql).fetchone()[0] + detail["rows"] = count + detail["status"] = "dry_run" + logger.info( + f"Would migrate {database}.{table_name}.{column_name}: " + f"{count} rows" + ) + else: + try: + # Get store config + from .settings import config + store_config = config.get("stores", {}).get(store_name, {}) + protocol = store_config.get("protocol", "file") + location = store_config.get("location", "") + + # Add new JSON column + sql = ( + f"ALTER TABLE `{database}`.`{table_name}` " + f"ADD COLUMN `{new_column}` JSON " + f"COMMENT '{new_comment}'" + ) + connection.query(sql) + + # Convert filepath to JSON with URL + update_sql = f""" + UPDATE `{database}`.`{table_name}` + SET `{new_column}` = JSON_OBJECT( + 'url', CONCAT('{protocol}://', '{location}/', `{column_name}`) + ) + WHERE `{column_name}` IS NOT NULL + """ + connection.query(update_sql) + + count_sql = f""" + SELECT COUNT(*) FROM `{database}`.`{table_name}` 
+ WHERE `{new_column}` IS NOT NULL + """ + count = connection.query(count_sql).fetchone()[0] + detail["rows"] = count + detail["status"] = "migrated" + result["columns_migrated"] += 1 + result["rows_migrated"] += count + + logger.info( + f"Migrated {database}.{table_name}.{column_name}: " + f"{count} rows" + ) + except Exception as e: + detail["status"] = "error" + detail["error"] = str(e) + logger.error(f"Failed to migrate: {e}") + raise DataJointError(f"Migration failed: {e}") from e + + result["details"].append(detail) + + return result From 08d5c6aaf2ac707538917e6c6019ac6efcc336b7 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 12 Jan 2026 16:03:13 -0600 Subject: [PATCH 02/10] feat: Add NpyCodec for lazy-loading numpy arrays Implement the `` codec for schema-addressed numpy array storage: - Add SchemaCodec base class for path-addressed storage codecs - Add NpyRef class for lazy array references with metadata - Add NpyCodec using .npy format with shape/dtype inspection - Refactor ObjectCodec to inherit from SchemaCodec - Rename is_external to is_store throughout codebase - Export SchemaCodec and NpyRef from public API - Bump version to 2.0.0a17 Key features: - Lazy loading: inspect shape/dtype without downloading - NumPy integration via __array__ protocol - Safe bulk fetch: returns NpyRef objects, not arrays - Schema-addressed paths: {schema}/{table}/{pk}/{attr}.npy Co-Authored-By: Claude Opus 4.5 --- src/datajoint/__init__.py | 9 +- src/datajoint/builtin_codecs.py | 570 ++++++++++++++++++++++++--- src/datajoint/codecs.py | 18 +- src/datajoint/declare.py | 4 +- src/datajoint/heading.py | 6 +- src/datajoint/migrate.py | 91 ++--- src/datajoint/schemas.py | 30 +- src/datajoint/version.py | 2 +- tests/integration/test_codecs.py | 4 +- tests/integration/test_npy_codec.py | 439 +++++++++++++++++++++ tests/integration/test_privileges.py | 4 +- tests/integration/test_schema.py | 12 +- tests/schema_codecs.py | 4 +- tests/unit/test_codecs.py | 4 +- 14 files changed, 1048 insertions(+), 149 deletions(-) create mode 100644 tests/integration/test_npy_codec.py diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index ae8d308d2..3a049e110 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -47,8 +47,12 @@ "MatStruct", # Codec API "Codec", + "SchemaCodec", "list_codecs", "get_codec", + "ObjectRef", + "NpyRef", + # Other "errors", "migrate", "DataJointError", @@ -56,7 +60,6 @@ "key_hash", "logger", "cli", - "ObjectRef", "ValidationResult", ] @@ -70,6 +73,10 @@ get_codec, list_codecs, ) +from .builtin_codecs import ( + SchemaCodec, + NpyRef, +) from .blob import MatCell, MatStruct from .connection import Connection, conn from .errors import DataJointError diff --git a/src/datajoint/builtin_codecs.py b/src/datajoint/builtin_codecs.py index 66589dc36..499fec846 100644 --- a/src/datajoint/builtin_codecs.py +++ b/src/datajoint/builtin_codecs.py @@ -11,6 +11,7 @@ - ````: Path-addressed storage for files/folders (Zarr, HDF5) - ````: File attachment (internal) or external with dedup - ````: Reference to existing file in store + - ````: Store numpy arrays as portable .npy files (external only) Example - Creating a Custom Codec: Here's how to define your own codec, modeled after the built-in codecs:: @@ -23,7 +24,7 @@ class GraphCodec(dj.Codec): name = "graph" # Use as in definitions - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: return "" # Compose with blob for serialization def encode(self, graph, *, key=None, 
store_name=None): @@ -102,9 +103,9 @@ class ProcessedData(dj.Manual): name = "blob" - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: """Return bytes for internal, for external storage.""" - return "" if is_external else "bytes" + return "" if is_store else "bytes" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: """Serialize a Python object to DataJoint's blob format.""" @@ -157,9 +158,9 @@ class RawContent(dj.Manual): name = "hash" - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: """Hash storage is external only.""" - if not is_external: + if not is_store: raise DataJointError(" requires @ (external storage only)") return "json" @@ -212,26 +213,186 @@ def validate(self, value: Any) -> None: # ============================================================================= -# Path-Addressed Storage Codec (OAS - Object-Augmented Schema) +# Schema-Addressed Storage Base Class # ============================================================================= -class ObjectCodec(Codec): +class SchemaCodec(Codec, register=False): """ - Path-addressed storage for files and folders. + Abstract base class for schema-addressed codecs. - The ```` codec provides managed file/folder storage where the path - is derived from the primary key: ``{schema}/{table}/{pk}/{field}/`` + Schema-addressed storage is an OAS (Object-Augmented Schema) addressing + scheme where paths mirror the database schema structure: + ``{schema}/{table}/{pk}/{attribute}``. This creates a browsable + organization in object storage that reflects the schema design. - Unlike ```` (hash-addressed), each row has its own storage path, - and content is deleted when the row is deleted. This is ideal for: + Subclasses must implement: + - ``name``: Codec name for ```` syntax + - ``encode()``: Serialize and upload content + - ``decode()``: Create lazy reference from metadata + - ``validate()``: Validate input values + + Helper Methods: + - ``_extract_context()``: Parse key dict into schema/table/field/pk + - ``_build_path()``: Construct storage path from context + - ``_get_backend()``: Get storage backend by name + + Comparison with Hash-addressed: + - **Schema-addressed** (this): Path from schema structure, no dedup + - **Hash-addressed**: Path from content hash, automatic dedup + + Example:: + + class MyCodec(SchemaCodec): + name = "my" + + def encode(self, value, *, key=None, store_name=None): + schema, table, field, pk = self._extract_context(key) + path, _ = self._build_path(schema, table, field, pk, ext=".dat") + backend = self._get_backend(store_name) + backend.put_buffer(serialize(value), path) + return {"path": path, "store": store_name, ...} + + def decode(self, stored, *, key=None): + backend = self._get_backend(stored.get("store")) + return MyRef(stored, backend) + + See Also + -------- + HashCodec : Hash-addressed storage with content deduplication. + ObjectCodec : Schema-addressed storage for files/folders. + NpyCodec : Schema-addressed storage for numpy arrays. + """ + + def get_dtype(self, is_store: bool) -> str: + """ + Return storage dtype. Schema-addressed codecs require @ modifier. + + Parameters + ---------- + is_store : bool + Must be True for schema-addressed codecs. + + Returns + ------- + str + "json" for metadata storage. + + Raises + ------ + DataJointError + If is_store is False (@ modifier missing). 
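+
+        Examples
+        --------
+        Schema-addressed codecs always map to a JSON column when a store
+        is given::
+
+            >>> NpyCodec().get_dtype(is_store=True)
+            'json'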
+ """ + if not is_store: + raise DataJointError(f"<{self.name}> requires @ (store only)") + return "json" + + def _extract_context(self, key: dict | None) -> tuple[str, str, str, dict]: + """ + Extract schema, table, field, and primary key from context dict. + + Parameters + ---------- + key : dict or None + Context dict with ``_schema``, ``_table``, ``_field``, + and primary key values. + + Returns + ------- + tuple[str, str, str, dict] + ``(schema, table, field, primary_key)`` + """ + key = dict(key) if key else {} + schema = key.pop("_schema", "unknown") + table = key.pop("_table", "unknown") + field = key.pop("_field", "data") + primary_key = {k: v for k, v in key.items() if not k.startswith("_")} + return schema, table, field, primary_key + + def _build_path( + self, + schema: str, + table: str, + field: str, + primary_key: dict, + ext: str | None = None, + ) -> tuple[str, str]: + """ + Build schema-addressed storage path. + + Constructs a path that mirrors the database schema structure: + ``{schema}/{table}/{pk_values}/{field}{ext}`` + + Parameters + ---------- + schema : str + Schema name. + table : str + Table name. + field : str + Field/attribute name. + primary_key : dict + Primary key values. + ext : str, optional + File extension (e.g., ".npy", ".zarr"). + + Returns + ------- + tuple[str, str] + ``(path, token)`` where path is the storage path and token + is a unique identifier. + """ + from .storage import build_object_path + + return build_object_path( + schema=schema, + table=table, + field=field, + primary_key=primary_key, + ext=ext, + ) + + def _get_backend(self, store_name: str | None = None): + """ + Get storage backend by name. + + Parameters + ---------- + store_name : str, optional + Store name. If None, returns default store. + + Returns + ------- + StorageBackend + Storage backend instance. + """ + from .content_registry import get_store_backend + + return get_store_backend(store_name) + + +# ============================================================================= +# Object Codec (Schema-Addressed Files/Folders) +# ============================================================================= + + +class ObjectCodec(SchemaCodec): + """ + Schema-addressed storage for files and folders. + + The ```` codec provides managed file/folder storage using + schema-addressed paths: ``{schema}/{table}/{pk}/{field}/``. This creates + a browsable organization in object storage that mirrors the database schema. + + Unlike hash-addressed storage (````), each row has its own path + and content is deleted when the row is deleted. Ideal for: - Zarr arrays (hierarchical chunked data) - HDF5 files - Complex multi-file outputs - Any content that shouldn't be deduplicated - External only - requires @ modifier. + Store only - requires @ modifier. 
Example:: @@ -258,24 +419,24 @@ def make(self, key): {store_root}/{schema}/{table}/{pk}/{field}/ - Comparison with ````:: + Comparison with hash-addressed:: - | Aspect | | | - |----------------|-------------------|---------------------| - | Addressing | Path (by PK) | Hash (by content) | - | Deduplication | No | Yes | - | Deletion | With row | GC when unreferenced| - | Use case | Zarr, HDF5 | Blobs, attachments | + | Aspect | | | + |----------------|---------------------|---------------------| + | Addressing | Schema-addressed | Hash-addressed | + | Deduplication | No | Yes | + | Deletion | With row | GC when unreferenced| + | Use case | Zarr, HDF5 | Blobs, attachments | + + See Also + -------- + SchemaCodec : Base class for schema-addressed codecs. + NpyCodec : Schema-addressed storage for numpy arrays. + HashCodec : Hash-addressed storage with deduplication. """ name = "object" - def get_dtype(self, is_external: bool) -> str: - """Object storage is external only.""" - if not is_external: - raise DataJointError(" requires @ (external storage only)") - return "json" - def encode( self, value: Any, @@ -304,15 +465,8 @@ def encode( from datetime import datetime, timezone from pathlib import Path - from .content_registry import get_store_backend - from .storage import build_object_path - - # Extract context from key - key = key or {} - schema = key.pop("_schema", "unknown") - table = key.pop("_table", "unknown") - field = key.pop("_field", "data") - primary_key = {k: v for k, v in key.items() if not k.startswith("_")} + # Extract context using inherited helper + schema, table, field, primary_key = self._extract_context(key) # Check for pre-computed metadata (from staged insert) if isinstance(value, dict) and "path" in value: @@ -353,17 +507,11 @@ def encode( else: raise TypeError(f" expects bytes or path, got {type(value).__name__}") - # Build storage path - path, token = build_object_path( - schema=schema, - table=table, - field=field, - primary_key=primary_key, - ext=ext, - ) + # Build storage path using inherited helper + path, token = self._build_path(schema, table, field, primary_key, ext=ext) - # Get storage backend - backend = get_store_backend(store_name) + # Get storage backend using inherited helper + backend = self._get_backend(store_name) # Upload content if is_dir: @@ -406,10 +554,8 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: Handle for accessing the stored content. 
""" from .objectref import ObjectRef - from .content_registry import get_store_backend - store_name = stored.get("store") - backend = get_store_backend(store_name) + backend = self._get_backend(stored.get("store")) return ObjectRef.from_json(stored, backend=backend) def validate(self, value: Any) -> None: @@ -472,9 +618,9 @@ class Documents(dj.Manual): name = "attach" - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: """Return bytes for internal, for external storage.""" - return "" if is_external else "bytes" + return "" if is_store else "bytes" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: """ @@ -610,9 +756,9 @@ class Recordings(dj.Manual): name = "filepath" - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: """Filepath is external only.""" - if not is_external: + if not is_store: raise DataJointError(" requires @store") return "json" @@ -688,3 +834,323 @@ def validate(self, value: Any) -> None: if not isinstance(value, (str, Path)): raise TypeError(f" expects a path string or Path, got {type(value).__name__}") + + +# ============================================================================= +# NumPy Array Codec (.npy format) +# ============================================================================= + + +class NpyRef: + """ + Lazy reference to a numpy array stored as a .npy file. + + This class provides metadata access without I/O and transparent + integration with numpy operations via the ``__array__`` protocol. + + Attributes + ---------- + shape : tuple[int, ...] + Array shape (from metadata, no I/O). + dtype : numpy.dtype + Array dtype (from metadata, no I/O). + path : str + Storage path within the store. + store : str or None + Store name (None for default). + + Examples + -------- + Metadata access without download:: + + ref = (Recording & key).fetch1('waveform') + print(ref.shape) # (1000, 32) - no download + print(ref.dtype) # float64 - no download + + Explicit loading:: + + arr = ref.load() # Downloads and returns np.ndarray + + Transparent numpy integration:: + + # These all trigger automatic download via __array__ + result = ref + 1 + result = np.mean(ref) + result = ref[0:100] # Slicing works too + """ + + __slots__ = ("_meta", "_backend", "_cached") + + def __init__(self, metadata: dict, backend: Any): + """ + Initialize NpyRef from metadata and storage backend. + + Parameters + ---------- + metadata : dict + JSON metadata containing path, store, dtype, shape. + backend : StorageBackend + Storage backend for file operations. 
+ """ + self._meta = metadata + self._backend = backend + self._cached = None + + @property + def shape(self) -> tuple: + """Array shape (no I/O required).""" + return tuple(self._meta["shape"]) + + @property + def dtype(self): + """Array dtype (no I/O required).""" + import numpy as np + + return np.dtype(self._meta["dtype"]) + + @property + def ndim(self) -> int: + """Number of dimensions (no I/O required).""" + return len(self._meta["shape"]) + + @property + def size(self) -> int: + """Total number of elements (no I/O required).""" + import math + + return math.prod(self._meta["shape"]) + + @property + def nbytes(self) -> int: + """Total bytes (estimated from shape and dtype, no I/O required).""" + return self.size * self.dtype.itemsize + + @property + def path(self) -> str: + """Storage path within the store.""" + return self._meta["path"] + + @property + def store(self) -> str | None: + """Store name (None for default store).""" + return self._meta.get("store") + + @property + def is_loaded(self) -> bool: + """True if array data has been downloaded and cached.""" + return self._cached is not None + + def load(self): + """ + Download and return the array. + + Returns + ------- + numpy.ndarray + The array data. + + Notes + ----- + The array is cached after first load. Subsequent calls return + the cached copy without additional I/O. + """ + import io + + import numpy as np + + if self._cached is None: + buffer = self._backend.get_buffer(self.path) + self._cached = np.load(io.BytesIO(buffer), allow_pickle=False) + return self._cached + + def __array__(self, dtype=None): + """ + NumPy array protocol for transparent integration. + + This method is called automatically when the NpyRef is used + in numpy operations (arithmetic, ufuncs, etc.). + + Parameters + ---------- + dtype : numpy.dtype, optional + Desired output dtype. + + Returns + ------- + numpy.ndarray + The array data, optionally cast to dtype. + """ + arr = self.load() + if dtype is not None: + return arr.astype(dtype) + return arr + + def __getitem__(self, key): + """Support indexing/slicing by loading then indexing.""" + return self.load()[key] + + def __len__(self) -> int: + """Length of first dimension.""" + if not self._meta["shape"]: + raise TypeError("len() of 0-dimensional array") + return self._meta["shape"][0] + + def __repr__(self) -> str: + status = "loaded" if self.is_loaded else "not loaded" + return f"NpyRef(shape={self.shape}, dtype={self.dtype}, {status})" + + def __str__(self) -> str: + return repr(self) + + +class NpyCodec(SchemaCodec): + """ + Schema-addressed storage for numpy arrays as .npy files. + + The ```` codec stores numpy arrays as standard ``.npy`` files + using schema-addressed paths: ``{schema}/{table}/{pk}/{attribute}.npy``. + Arrays are fetched lazily via ``NpyRef``, which provides metadata access + without I/O and transparent numpy integration via ``__array__``. + + Store only - requires ``@`` modifier. + + Key Features: + - **Portable**: Standard .npy format readable by numpy, MATLAB, etc. 
+ - **Lazy loading**: Metadata (shape, dtype) available without download + - **Transparent**: Use in numpy operations triggers automatic download + - **Safe bulk fetch**: Fetching many rows doesn't download until needed + - **Schema-addressed**: Browsable paths that mirror database structure + + Example:: + + @schema + class Recording(dj.Manual): + definition = ''' + recording_id : int + --- + waveform : # default store + spectrogram : # specific store + ''' + + # Insert - just pass the array + Recording.insert1({ + 'recording_id': 1, + 'waveform': np.random.randn(1000, 32), + }) + + # Fetch - returns NpyRef (lazy) + ref = (Recording & 'recording_id=1').fetch1('waveform') + ref.shape # (1000, 32) - no download + ref.dtype # float64 - no download + + # Use in numpy ops - downloads automatically + result = np.mean(ref, axis=0) + + # Or load explicitly + arr = ref.load() + + Storage Details: + - File format: NumPy .npy (version 1.0 or 2.0) + - Path: ``{schema}/{table}/{pk}/{attribute}.npy`` + - Database column: JSON with ``{path, store, dtype, shape}`` + + See Also + -------- + NpyRef : The lazy array reference returned on fetch. + SchemaCodec : Base class for schema-addressed codecs. + ObjectCodec : Schema-addressed storage for files/folders. + """ + + name = "npy" + + def validate(self, value: Any) -> None: + """ + Validate that value is a numpy array suitable for .npy storage. + + Parameters + ---------- + value : Any + Value to validate. + + Raises + ------ + DataJointError + If value is not a numpy array or has object dtype. + """ + import numpy as np + + if not isinstance(value, np.ndarray): + raise DataJointError(f" requires numpy.ndarray, got {type(value).__name__}") + if value.dtype == object: + raise DataJointError(" does not support object dtype arrays") + + def encode( + self, + value: Any, + *, + key: dict | None = None, + store_name: str | None = None, + ) -> dict: + """ + Serialize array to .npy and upload to storage. + + Parameters + ---------- + value : numpy.ndarray + Array to store. + key : dict, optional + Context dict with ``_schema``, ``_table``, ``_field``, + and primary key values for path construction. + store_name : str, optional + Target store. If None, uses default store. + + Returns + ------- + dict + JSON metadata: ``{path, store, dtype, shape}``. + """ + import io + + import numpy as np + + # Extract context using inherited helper + schema, table, field, primary_key = self._extract_context(key) + + # Build schema-addressed storage path + path, _ = self._build_path(schema, table, field, primary_key, ext=".npy") + + # Serialize to .npy format + buffer = io.BytesIO() + np.save(buffer, value, allow_pickle=False) + npy_bytes = buffer.getvalue() + + # Upload to storage using inherited helper + backend = self._get_backend(store_name) + backend.put_buffer(npy_bytes, path) + + # Return metadata (includes numpy-specific shape/dtype) + return { + "path": path, + "store": store_name, + "dtype": str(value.dtype), + "shape": list(value.shape), + } + + def decode(self, stored: dict, *, key: dict | None = None) -> NpyRef: + """ + Create lazy NpyRef from stored metadata. + + Parameters + ---------- + stored : dict + JSON metadata from database. + key : dict, optional + Primary key values (unused). + + Returns + ------- + NpyRef + Lazy array reference with metadata access and numpy integration. 
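+
+        ``stored`` is the JSON document written by ``encode()``, for example
+        (illustrative values)::
+
+            {"path": "lab/recording/recording_id=1/waveform.npy",
+             "store": None, "dtype": "float64", "shape": [1000, 32]}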
+ """ + backend = self._get_backend(stored.get("store")) + return NpyRef(stored, backend) diff --git a/src/datajoint/codecs.py b/src/datajoint/codecs.py index 211308d1c..e6ab22931 100644 --- a/src/datajoint/codecs.py +++ b/src/datajoint/codecs.py @@ -11,7 +11,7 @@ class GraphCodec(dj.Codec): name = "graph" - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: return "" def encode(self, graph, *, key=None, store_name=None): @@ -64,7 +64,7 @@ class Codec(ABC): >>> class GraphCodec(dj.Codec): ... name = "graph" ... - ... def get_dtype(self, is_external: bool) -> str: + ... def get_dtype(self, is_store: bool) -> str: ... return "" ... ... def encode(self, graph, *, key=None, store_name=None): @@ -120,14 +120,14 @@ def __init_subclass__(cls, *, register: bool = True, **kwargs): logger.debug(f"Registered codec <{cls.name}> from {cls.__module__}.{cls.__name__}") @abstractmethod - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: """ Return the storage dtype for this codec. Parameters ---------- - is_external : bool - True if ``@`` modifier present (external storage). + is_store : bool + True if ``@`` modifier present (object store vs inline). Returns ------- @@ -138,7 +138,7 @@ def get_dtype(self, is_external: bool) -> str: Raises ------ DataJointError - If external storage not supported but requested. + If store mode not supported but requested. """ ... @@ -450,11 +450,11 @@ def resolve_dtype( codec = get_codec(type_name) chain.append(codec) - # Determine if external based on whether @ is present - is_external = effective_store is not None + # Determine if store mode based on whether @ is present + is_store = effective_store is not None # Get the inner dtype from the codec - inner_dtype = codec.get_dtype(is_external) + inner_dtype = codec.get_dtype(is_store) # Recursively resolve the inner dtype, propagating store final_dtype, inner_chain, resolved_store = resolve_dtype(inner_dtype, seen, effective_store) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index d86e90ed9..c96dc6a84 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -651,8 +651,8 @@ def substitute_special_type(match: dict, category: str, foreign_key_sql: list[st if store_name is not None: match["store"] = store_name # Determine if external storage is used (store_name is present, even if empty string for default) - is_external = store_name is not None - inner_dtype = codec.get_dtype(is_external=is_external) + is_store = store_name is not None + inner_dtype = codec.get_dtype(is_store=is_store) # If inner dtype is a codec without store, propagate the store from outer type # e.g., returns , we need to resolve as diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 96383170b..c2ca497fc 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -39,7 +39,7 @@ def __init__(self, codec_name: str): def name(self) -> str: return self._codec_name - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: raise DataJointError( f"Codec <{self._codec_name}> is not registered. Define a Codec subclass with name='{self._codec_name}'." 
) @@ -450,8 +450,8 @@ def _init_from_database(self) -> None: attr["codec"] = _MissingType(codec_spec) else: # Determine if external storage based on store presence - is_external = attr.get("store") is not None - attr["type"] = attr["codec"].get_dtype(is_external=is_external) + is_store = attr.get("store") is not None + attr["type"] = attr["codec"].get_dtype(is_store=is_store) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): raise DataJointError(f"Invalid dtype '{attr['type']}' in codec <{codec_spec}>.") # Update is_blob based on resolved dtype (check both BYTES and NATIVE_BLOB patterns) diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index b72896d0a..f0e1371ad 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -8,7 +8,6 @@ from __future__ import annotations -import json import logging import re from typing import TYPE_CHECKING @@ -495,9 +494,7 @@ def _find_external_columns(schema: Schema) -> list[dict]: AND DATA_TYPE = 'binary' AND CHARACTER_MAXIMUM_LENGTH = 16 """ - columns = connection.query( - columns_query, args=(schema.database, table_name) - ).fetchall() + columns = connection.query(columns_query, args=(schema.database, table_name)).fetchall() for column_name, column_type, comment in columns: comment = comment or "" @@ -506,28 +503,32 @@ def _find_external_columns(schema: Schema) -> list[dict]: blob_match = EXTERNAL_PATTERNS["blob"].search(comment) if blob_match: store_name = blob_match.group(1) or "external" - results.append({ - "table_name": table_name, - "column_name": column_name, - "column_type": column_type, - "comment": comment, - "store_name": store_name, - "external_type": "blob", - }) + results.append( + { + "table_name": table_name, + "column_name": column_name, + "column_type": column_type, + "comment": comment, + "store_name": store_name, + "external_type": "blob", + } + ) continue # Check for external attach pattern attach_match = EXTERNAL_PATTERNS["attach"].search(comment) if attach_match: store_name = attach_match.group(1) or "external" - results.append({ - "table_name": table_name, - "column_name": column_name, - "column_type": column_type, - "comment": comment, - "store_name": store_name, - "external_type": "attach", - }) + results.append( + { + "table_name": table_name, + "column_name": column_name, + "column_type": column_type, + "comment": comment, + "store_name": store_name, + "external_type": "attach", + } + ) return results @@ -561,22 +562,22 @@ def _find_filepath_columns(schema: Schema) -> list[dict]: AND DATA_TYPE = 'varchar' AND COLUMN_COMMENT LIKE '%%:filepath%%' """ - columns = connection.query( - columns_query, args=(schema.database, table_name) - ).fetchall() + columns = connection.query(columns_query, args=(schema.database, table_name)).fetchall() for column_name, column_type, comment in columns: comment = comment or "" match = FILEPATH_PATTERN.search(comment) if match: store_name = match.group(1) or "external" - results.append({ - "table_name": table_name, - "column_name": column_name, - "column_type": column_type, - "comment": comment, - "store_name": store_name, - }) + results.append( + { + "table_name": table_name, + "column_name": column_name, + "column_type": column_type, + "comment": comment, + "store_name": store_name, + } + ) return results @@ -751,10 +752,7 @@ def migrate_external( count = connection.query(count_sql).fetchone()[0] detail["rows"] = count detail["status"] = "dry_run" - logger.info( - f"Would migrate {database}.{table_name}.{column_name}: " - f"{count} rows, 
store={store_name}" - ) + logger.info(f"Would migrate {database}.{table_name}.{column_name}: " f"{count} rows, store={store_name}") else: try: # Add new JSON column @@ -771,6 +769,7 @@ def migrate_external( # Get store config for URL building from .settings import config + store_config = config.get("stores", {}).get(store_name, {}) protocol = store_config.get("protocol", "file") location = store_config.get("location", "") @@ -800,16 +799,11 @@ def migrate_external( result["columns_migrated"] += 1 result["rows_migrated"] += count - logger.info( - f"Migrated {database}.{table_name}.{column_name}: " - f"{count} rows" - ) + logger.info(f"Migrated {database}.{table_name}.{column_name}: " f"{count} rows") except Exception as e: detail["status"] = "error" detail["error"] = str(e) - logger.error( - f"Failed to migrate {table_name}.{column_name}: {e}" - ) + logger.error(f"Failed to migrate {table_name}.{column_name}: {e}") raise DataJointError(f"Migration failed: {e}") from e result["details"].append(detail) @@ -897,9 +891,7 @@ def migrate_filepath( detail["action"] = "finalize" if dry_run: - logger.info( - f"Would finalize {database}.{table_name}.{column_name}" - ) + logger.info(f"Would finalize {database}.{table_name}.{column_name}") detail["status"] = "dry_run" else: try: @@ -952,14 +944,12 @@ def migrate_filepath( count = connection.query(count_sql).fetchone()[0] detail["rows"] = count detail["status"] = "dry_run" - logger.info( - f"Would migrate {database}.{table_name}.{column_name}: " - f"{count} rows" - ) + logger.info(f"Would migrate {database}.{table_name}.{column_name}: " f"{count} rows") else: try: # Get store config from .settings import config + store_config = config.get("stores", {}).get(store_name, {}) protocol = store_config.get("protocol", "file") location = store_config.get("location", "") @@ -992,10 +982,7 @@ def migrate_filepath( result["columns_migrated"] += 1 result["rows_migrated"] += count - logger.info( - f"Migrated {database}.{table_name}.{column_name}: " - f"{count} rows" - ) + logger.info(f"Migrated {database}.{table_name}.{column_name}: " f"{count} rows") except Exception as e: detail["status"] = "error" detail["error"] = str(e) diff --git a/src/datajoint/schemas.py b/src/datajoint/schemas.py index 399ab1b9f..98faa83f2 100644 --- a/src/datajoint/schemas.py +++ b/src/datajoint/schemas.py @@ -216,7 +216,7 @@ def __call__(self, cls: type, *, context: dict[str, Any] | None = None) -> type: cls : type Table class to decorate. context : dict, optional - Declaration context. Supplied by spawn_missing_classes. + Declaration context. Supplied by make_classes. Returns ------- @@ -335,39 +335,39 @@ def size_on_disk(self) -> int: ).fetchone()[0] ) - def spawn_missing_classes(self, context: dict[str, Any] | None = None) -> None: + def make_classes(self, into: dict[str, Any] | None = None) -> None: """ - Create Python table classes for tables without existing classes. + Create Python table classes for tables in the schema. Introspects the database schema and creates appropriate Python classes (Lookup, Manual, Imported, Computed, Part) for tables that don't have - corresponding classes in the context. + corresponding classes in the target namespace. Parameters ---------- - context : dict, optional + into : dict, optional Namespace to place created classes into. Defaults to caller's local namespace. 
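+
+        Examples
+        --------
+        A minimal sketch (schema name is illustrative)::
+
+            schema = dj.Schema('my_lab')
+            schema.make_classes()                # classes land in the caller's namespace
+            schema.make_classes(into=globals())  # or target an explicit namespace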
""" self._assert_exists() - if context is None: + if into is None: if self.context is not None: - context = self.context + into = self.context else: - # if context is missing, use the calling namespace + # if into is missing, use the calling namespace frame = inspect.currentframe().f_back - context = frame.f_locals + into = frame.f_locals del frame tables = [ row[0] for row in self.connection.query("SHOW TABLES in `%s`" % self.database) - if lookup_class_name("`{db}`.`{tab}`".format(db=self.database, tab=row[0]), context, 0) is None + if lookup_class_name("`{db}`.`{tab}`".format(db=self.database, tab=row[0]), into, 0) is None ] master_classes = (Lookup, Manual, Imported, Computed) part_tables = [] for table_name in tables: class_name = to_camel_case(table_name) - if class_name not in context: + if class_name not in into: try: cls = next(cls for cls in master_classes if re.fullmatch(cls.tier_regexp, table_name)) except StopIteration: @@ -375,19 +375,19 @@ def spawn_missing_classes(self, context: dict[str, Any] | None = None) -> None: part_tables.append(table_name) else: # declare and decorate master table classes - context[class_name] = self(type(class_name, (cls,), dict()), context=context) + into[class_name] = self(type(class_name, (cls,), dict()), context=into) # attach parts to masters for table_name in part_tables: groups = re.fullmatch(Part.tier_regexp, table_name).groupdict() class_name = to_camel_case(groups["part"]) try: - master_class = context[to_camel_case(groups["master"])] + master_class = into[to_camel_case(groups["master"])] except KeyError: raise DataJointError("The table %s does not follow DataJoint naming conventions" % table_name) part_class = type(class_name, (Part,), dict(definition=...)) part_class._master = master_class - self._decorate_table(part_class, context=context, assert_declared=True) + self._decorate_table(part_class, context=into, assert_declared=True) setattr(master_class, class_name, part_class) def drop(self, prompt: bool | None = None) -> None: @@ -830,7 +830,7 @@ def __init__( if add_objects: self.__dict__.update(add_objects) self.__dict__["schema"] = _schema - _schema.spawn_missing_classes(context=self.__dict__) + _schema.make_classes(into=self.__dict__) def list_schemas(connection: Connection | None = None) -> list[str]: diff --git a/src/datajoint/version.py b/src/datajoint/version.py index 31f651ea6..426a00789 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ # version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a16" +__version__ = "2.0.0a17" diff --git a/tests/integration/test_codecs.py b/tests/integration/test_codecs.py index 6d160e5b5..f4ed7483a 100644 --- a/tests/integration/test_codecs.py +++ b/tests/integration/test_codecs.py @@ -39,9 +39,9 @@ def schema_codec( @pytest.fixture def local_schema(schema_codec, schema_name): - """Fixture for testing spawned classes""" + """Fixture for testing generated classes""" local_schema = dj.Schema(schema_name, connection=schema_codec.connection) - local_schema.spawn_missing_classes() + local_schema.make_classes() yield local_schema # Don't drop - schema_codec fixture handles cleanup diff --git a/tests/integration/test_npy_codec.py b/tests/integration/test_npy_codec.py new file mode 100644 index 000000000..b5438c68b --- /dev/null +++ b/tests/integration/test_npy_codec.py @@ -0,0 +1,439 @@ +""" +Tests for the 
NpyCodec - schema-addressed numpy array storage. + +These tests verify: +- NpyCodec encode/decode roundtrip +- NpyRef lazy loading behavior +- NpyRef metadata access without I/O +- NpyRef numpy integration via __array__ +- Schema-addressed path construction +""" + +import numpy as np +import pytest + +import datajoint as dj +from datajoint.builtin_codecs import NpyCodec, NpyRef, SchemaCodec + + +# ============================================================================= +# Test Schema Definition +# ============================================================================= + + +class Recording(dj.Manual): + definition = """ + recording_id : int + --- + waveform : + """ + + +class MultiArray(dj.Manual): + definition = """ + item_id : int + --- + small_array : + large_array : + """ + + +LOCALS_NPY = {"Recording": Recording, "MultiArray": MultiArray} + + +# ============================================================================= +# Fixtures +# ============================================================================= + + +@pytest.fixture +def schema_name(prefix): + return prefix + "_test_npy_codec" + + +@pytest.fixture +def schema_npy(connection_test, s3_creds, tmpdir, schema_name, mock_stores): + """Create schema with NpyCodec tables.""" + # mock_stores fixture sets up object_storage.stores with repo-s3, etc. + context = dict(LOCALS_NPY) + schema = dj.schema(schema_name, context=context, connection=connection_test) + schema(Recording) + schema(MultiArray) + yield schema + schema.drop() + + +# ============================================================================= +# Unit Tests (no database required) +# ============================================================================= + + +class TestNpyRefUnit: + """Unit tests for NpyRef without database.""" + + def test_npy_ref_metadata_access(self): + """NpyRef should provide metadata without I/O.""" + # Mock metadata as would be stored in JSON + metadata = { + "path": "test/recording/recording_id=1/waveform.npy", + "store": "default", + "dtype": "float64", + "shape": [1000, 32], + } + + # Create NpyRef with mock backend + class MockBackend: + def get_buffer(self, path): + raise AssertionError("Should not be called for metadata access") + + ref = NpyRef(metadata, MockBackend()) + + # These should NOT trigger I/O + assert ref.shape == (1000, 32) + assert ref.dtype == np.dtype("float64") + assert ref.ndim == 2 + assert ref.size == 32000 + assert ref.nbytes == 32000 * 8 # float64 = 8 bytes + assert ref.path == "test/recording/recording_id=1/waveform.npy" + assert ref.store == "default" + assert ref.is_loaded is False + + def test_npy_ref_repr(self): + """NpyRef repr should show shape, dtype, and load status.""" + metadata = { + "path": "test.npy", + "store": None, + "dtype": "int32", + "shape": [100], + } + + class MockBackend: + pass + + ref = NpyRef(metadata, MockBackend()) + repr_str = repr(ref) + + assert "NpyRef" in repr_str + assert "(100,)" in repr_str + assert "int32" in repr_str + assert "not loaded" in repr_str + + def test_npy_ref_len(self): + """NpyRef should support len() for first dimension.""" + metadata = {"path": "test.npy", "store": None, "dtype": "float32", "shape": [50, 10]} + + class MockBackend: + pass + + ref = NpyRef(metadata, MockBackend()) + assert len(ref) == 50 + + def test_npy_ref_len_0d_raises(self): + """NpyRef len() should raise for 0-d arrays.""" + metadata = {"path": "test.npy", "store": None, "dtype": "float32", "shape": []} + + class MockBackend: + pass + + ref = NpyRef(metadata, 
MockBackend()) + with pytest.raises(TypeError, match="0-dimensional"): + len(ref) + + +class TestNpyCodecUnit: + """Unit tests for NpyCodec without database.""" + + def test_codec_is_schema_codec(self): + """NpyCodec should inherit from SchemaCodec.""" + codec = NpyCodec() + assert isinstance(codec, SchemaCodec) + + def test_codec_name(self): + """NpyCodec should be registered as 'npy'.""" + codec = NpyCodec() + assert codec.name == "npy" + + def test_codec_requires_store(self): + """NpyCodec should require @ modifier.""" + codec = NpyCodec() + + # Should raise without @ + with pytest.raises(dj.DataJointError, match="requires @"): + codec.get_dtype(is_store=False) + + # Should return json with @ + assert codec.get_dtype(is_store=True) == "json" + + def test_codec_validate_requires_ndarray(self): + """NpyCodec should reject non-ndarray values.""" + codec = NpyCodec() + + # Should reject list + with pytest.raises(dj.DataJointError, match="requires numpy.ndarray"): + codec.validate([1, 2, 3]) + + # Should reject dict + with pytest.raises(dj.DataJointError, match="requires numpy.ndarray"): + codec.validate({"data": [1, 2, 3]}) + + # Should accept ndarray + codec.validate(np.array([1, 2, 3])) # No exception + + def test_codec_validate_rejects_object_dtype(self): + """NpyCodec should reject object dtype arrays.""" + codec = NpyCodec() + + obj_array = np.array([{}, []], dtype=object) + with pytest.raises(dj.DataJointError, match="object dtype"): + codec.validate(obj_array) + + +# ============================================================================= +# Integration Tests (require database + MinIO) +# ============================================================================= + + +class TestNpyCodecIntegration: + """Integration tests for NpyCodec with real storage.""" + + def test_insert_fetch_roundtrip(self, schema_npy, minio_client): + """Basic insert and fetch should preserve array data.""" + rec = Recording() + rec.delete() + + # Insert array + original = np.random.randn(100, 32).astype(np.float64) + rec.insert1({"recording_id": 1, "waveform": original}) + + # Fetch returns NpyRef + result = rec.fetch1("waveform") + assert isinstance(result, NpyRef) + + # Load and compare + loaded = result.load() + assert isinstance(loaded, np.ndarray) + np.testing.assert_array_equal(loaded, original) + + rec.delete() + + def test_npy_ref_caching(self, schema_npy, minio_client): + """NpyRef should cache loaded data.""" + rec = Recording() + rec.delete() + + original = np.array([1, 2, 3, 4, 5]) + rec.insert1({"recording_id": 1, "waveform": original}) + + ref = rec.fetch1("waveform") + + # First load + arr1 = ref.load() + assert ref.is_loaded is True + + # Second load should return same object (cached) + arr2 = ref.load() + assert arr1 is arr2 + + rec.delete() + + def test_npy_ref_array_protocol(self, schema_npy, minio_client): + """NpyRef should work transparently in numpy operations.""" + rec = Recording() + rec.delete() + + original = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) + rec.insert1({"recording_id": 1, "waveform": original}) + + ref = rec.fetch1("waveform") + + # __array__ is triggered by numpy functions, not Python operators + # Use np.asarray() or pass to numpy functions + result = np.asarray(ref) + 1 + np.testing.assert_array_equal(result, original + 1) + + result = np.mean(ref) + assert result == np.mean(original) + + result = np.asarray(ref) + np.testing.assert_array_equal(result, original) + + # Also test that numpy ufuncs work + result = np.add(ref, 1) + np.testing.assert_array_equal(result, 
original + 1) + + rec.delete() + + def test_npy_ref_indexing(self, schema_npy, minio_client): + """NpyRef should support indexing/slicing.""" + rec = Recording() + rec.delete() + + original = np.arange(100).reshape(10, 10) + rec.insert1({"recording_id": 1, "waveform": original}) + + ref = rec.fetch1("waveform") + + # Indexing + assert ref[0, 0] == 0 + assert ref[5, 5] == 55 + + # Slicing + np.testing.assert_array_equal(ref[0:2], original[0:2]) + np.testing.assert_array_equal(ref[:, 0], original[:, 0]) + + rec.delete() + + def test_bulk_fetch_lazy(self, schema_npy, minio_client): + """Fetching via to_dicts should return NpyRefs that are lazy.""" + rec = Recording() + rec.delete() + + # Insert multiple arrays + for i in range(5): + rec.insert1({"recording_id": i, "waveform": np.random.randn(10, 10)}) + + # Fetch all using to_dicts - should return NpyRefs + results = rec.to_dicts() + assert len(results) == 5 + + refs = [r["waveform"] for r in results] + for ref in refs: + assert isinstance(ref, NpyRef) + assert ref.is_loaded is False # Not loaded yet + + # Access metadata without loading + shapes = [ref.shape for ref in refs] + assert all(s == (10, 10) for s in shapes) + assert all(not ref.is_loaded for ref in refs) # Still not loaded + + # Now load one + refs[0].load() + assert refs[0].is_loaded is True + assert not refs[1].is_loaded # Others still not loaded + + rec.delete() + + def test_different_dtypes(self, schema_npy, minio_client): + """NpyCodec should handle various numpy dtypes.""" + rec = Recording() + rec.delete() + + test_cases = [ + (1, np.array([1, 2, 3], dtype=np.int32)), + (2, np.array([1.0, 2.0, 3.0], dtype=np.float32)), + (3, np.array([1.0, 2.0, 3.0], dtype=np.float64)), + (4, np.array([True, False, True], dtype=np.bool_)), + (5, np.array([1 + 2j, 3 + 4j], dtype=np.complex128)), + ] + + for rec_id, arr in test_cases: + rec.insert1({"recording_id": rec_id, "waveform": arr}) + + for rec_id, original in test_cases: + ref = (rec & f"recording_id={rec_id}").fetch1("waveform") + loaded = ref.load() + assert loaded.dtype == original.dtype + np.testing.assert_array_equal(loaded, original) + + rec.delete() + + def test_multidimensional_arrays(self, schema_npy, minio_client): + """NpyCodec should handle various array shapes.""" + rec = Recording() + rec.delete() + + test_cases = [ + (1, np.array([1, 2, 3])), # 1D + (2, np.array([[1, 2], [3, 4]])), # 2D + (3, np.random.randn(2, 3, 4)), # 3D + (4, np.random.randn(2, 3, 4, 5)), # 4D + (5, np.array(42)), # 0D scalar + ] + + for rec_id, arr in test_cases: + rec.insert1({"recording_id": rec_id, "waveform": arr}) + + for rec_id, original in test_cases: + ref = (rec & f"recording_id={rec_id}").fetch1("waveform") + assert ref.shape == original.shape + assert ref.ndim == original.ndim + loaded = ref.load() + np.testing.assert_array_equal(loaded, original) + + rec.delete() + + def test_schema_addressed_path(self, schema_npy, minio_client): + """NpyCodec should store files with .npy extension.""" + rec = Recording() + rec.delete() + + rec.insert1({"recording_id": 42, "waveform": np.array([1, 2, 3])}) + + ref = rec.fetch1("waveform") + path = ref.path + + # Path should end with .npy extension + assert path.endswith(".npy"), f"Path should end with .npy, got: {path}" + + # Verify the file can be loaded + arr = ref.load() + np.testing.assert_array_equal(arr, np.array([1, 2, 3])) + + rec.delete() + + +class TestNpyCodecEdgeCases: + """Edge case tests for NpyCodec.""" + + def test_empty_array(self, schema_npy, minio_client): + """NpyCodec should handle 
empty arrays.""" + rec = Recording() + rec.delete() + + empty = np.array([]) + rec.insert1({"recording_id": 1, "waveform": empty}) + + ref = rec.fetch1("waveform") + assert ref.shape == (0,) + assert ref.size == 0 + + loaded = ref.load() + np.testing.assert_array_equal(loaded, empty) + + rec.delete() + + def test_large_array(self, schema_npy, minio_client): + """NpyCodec should handle large arrays.""" + rec = Recording() + rec.delete() + + # 10MB array + large = np.random.randn(1000, 1000).astype(np.float64) + rec.insert1({"recording_id": 1, "waveform": large}) + + ref = rec.fetch1("waveform") + assert ref.shape == (1000, 1000) + assert ref.nbytes == 8_000_000 + + loaded = ref.load() + np.testing.assert_array_equal(loaded, large) + + rec.delete() + + def test_structured_array(self, schema_npy, minio_client): + """NpyCodec should handle structured arrays.""" + rec = Recording() + rec.delete() + + dt = np.dtype([("x", np.float64), ("y", np.float64), ("label", "U10")]) + structured = np.array([(1.0, 2.0, "a"), (3.0, 4.0, "b")], dtype=dt) + + rec.insert1({"recording_id": 1, "waveform": structured}) + + ref = rec.fetch1("waveform") + loaded = ref.load() + + assert loaded.dtype == structured.dtype + np.testing.assert_array_equal(loaded, structured) + + rec.delete() diff --git a/tests/integration/test_privileges.py b/tests/integration/test_privileges.py index 0939823a0..763e7c04b 100644 --- a/tests/integration/test_privileges.py +++ b/tests/integration/test_privileges.py @@ -81,11 +81,11 @@ def test_fail_create_schema(self, connection_djview): def test_insert_failure(self, connection_djview, schema_any): unprivileged = dj.Schema(schema_any.database, namespace, connection=connection_djview) - unprivileged.spawn_missing_classes() + unprivileged.make_classes() UnprivilegedLanguage = namespace["Language"] assert issubclass(UnprivilegedLanguage, dj.Lookup) and len(UnprivilegedLanguage()) == len( schema.Language() - ), "failed to spawn missing classes" + ), "failed to make classes" with pytest.raises(dj.DataJointError): UnprivilegedLanguage().insert1(("Socrates", "Greek")) diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 8cf231bf5..6ef615466 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -29,7 +29,7 @@ def schema_empty_module(schema_any, schema_empty): """ Mock the module tests_old.schema_empty. The test `test_namespace_population` will check that the module contains all the - classes in schema_any, after running `spawn_missing_classes`. + classes in schema_any, after running `make_classes`. """ namespace_dict = { "_": schema_any, @@ -51,7 +51,7 @@ def schema_empty(connection_test, schema_any, prefix): schema_empty = dj.Schema(prefix + "_test1", context=context, connection=connection_test) schema_empty(Ephys) # load the rest of the classes - schema_empty.spawn_missing_classes(context=context) + schema_empty.make_classes(into=context) yield schema_empty # Don't drop the schema since schema_any still needs it @@ -77,12 +77,12 @@ def test_drop_unauthorized(connection_test): def test_namespace_population(schema_empty_module): """ With the schema_empty_module fixture, this test - mimics the behavior of `spawn_missing_classes`, as if the schema - was declared in a separate module and `spawn_missing_classes` was called in that namespace. + mimics the behavior of `make_classes`, as if the schema + was declared in a separate module and `make_classes` was called in that namespace. 
""" - # Spawn missing classes in the caller's (self) namespace. + # Create classes in the caller's (self) namespace. schema_empty_module.schema.context = None - schema_empty_module.schema.spawn_missing_classes(context=None) + schema_empty_module.schema.make_classes(into=None) # Then add them to the mock module's namespace. for k, v in locals().items(): if inspect.isclass(v): diff --git a/tests/schema_codecs.py b/tests/schema_codecs.py index 6a8d478d4..97307f985 100644 --- a/tests/schema_codecs.py +++ b/tests/schema_codecs.py @@ -10,7 +10,7 @@ class GraphCodec(dj.Codec): name = "graph" - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: """Chain to blob for serialization.""" return "" @@ -29,7 +29,7 @@ class LayoutCodec(dj.Codec): name = "layout" - def get_dtype(self, is_external: bool) -> str: + def get_dtype(self, is_store: bool) -> str: """Chain to blob for serialization.""" return "" diff --git a/tests/unit/test_codecs.py b/tests/unit/test_codecs.py index ada626748..9e0460ca6 100644 --- a/tests/unit/test_codecs.py +++ b/tests/unit/test_codecs.py @@ -368,8 +368,8 @@ def test_blob_properties(self): """Test BlobCodec properties.""" blob_codec = get_codec("blob") assert blob_codec.name == "blob" - assert blob_codec.get_dtype(is_external=False) == "bytes" - assert blob_codec.get_dtype(is_external=True) == "" + assert blob_codec.get_dtype(is_store=False) == "bytes" + assert blob_codec.get_dtype(is_store=True) == "" def test_blob_encode_decode_roundtrip(self): """Test that encode/decode is a proper roundtrip.""" From 588751b845ac32183e33e4ebe02f075ac7fb155d Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 12 Jan 2026 16:29:37 -0600 Subject: [PATCH 03/10] fix: Pass schema context to codec encode for schema-addressed paths The SchemaCodec (used by NpyCodec and ObjectCodec) needs _schema, _table, _field, and primary key values to construct schema-addressed storage paths. Previously, key=None was passed, resulting in "unknown/unknown" paths. 
Now builds proper context dict from table metadata and row values, enabling navigable paths like: {schema}/{table}/objects/{pk_path}/{attribute}.npy Co-Authored-By: Claude Opus 4.5 --- src/datajoint/table.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/datajoint/table.py b/src/datajoint/table.py index 0040943c5..4fa0599d8 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -1177,6 +1177,19 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): # Resolve full type chain _, type_chain, resolved_store = resolve_dtype(f"<{attr.codec.name}>", store_name=attr.store) + # Build context dict for schema-addressed codecs + # Include _schema, _table, _field, and primary key values + context = { + "_schema": self.database, + "_table": self.table_name, + "_field": name, + } + # Add primary key values from row if available + if row is not None: + for pk_name in self.primary_key: + if pk_name in row: + context[pk_name] = row[pk_name] + # Apply encoders from outermost to innermost for attr_type in type_chain: # Pass store_name to encoders that support it (check via introspection) @@ -1184,9 +1197,9 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): sig = inspect.signature(attr_type.encode) if "store_name" in sig.parameters: - value = attr_type.encode(value, key=None, store_name=resolved_store) + value = attr_type.encode(value, key=context, store_name=resolved_store) else: - value = attr_type.encode(value, key=None) + value = attr_type.encode(value, key=context) # Handle NULL values if value is None or (attr.numeric and (value == "" or np.isnan(float(value)))): From 9f6826efb2a793b7eb833333bcedbdb7d16476f2 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 12 Jan 2026 16:55:29 -0600 Subject: [PATCH 04/10] chore: Merge enhance/blob-preview-display and bump to 2.0.0a18 Merge PR #1330 (blob preview display) into feature/npy-codec. Bump version from 2.0.0a17 to 2.0.0a18. Co-Authored-By: Claude Opus 4.5 --- src/datajoint/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datajoint/version.py b/src/datajoint/version.py index 426a00789..cc70cfae4 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ # version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a17" +__version__ = "2.0.0a18" From acfaf0e858ade3278db9455e850e3f45cec5f3d3 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 12 Jan 2026 17:00:52 -0600 Subject: [PATCH 05/10] fix: Raise error instead of returning 'bytes' for missing field Address reviewer feedback from PR #1330: attr should never be None since field_name comes from heading.names. Raising an error surfaces bugs immediately rather than silently returning a misleading placeholder. 
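For illustration, a minimal stand-alone sketch of the guard this patch introduces
(hypothetical helper name; the actual change is to _get_blob_placeholder in the
diff below, and the codec-based placeholder logic after the check is unchanged):

    from datajoint.errors import DataJointError

    def placeholder_for(heading, field_name):
        # field_name is taken from heading.names, so a missing attribute
        # indicates a bug rather than a value that should be displayed
        attr = heading.attributes.get(field_name)
        if attr is None:
            raise DataJointError(f"Field '{field_name}' not found in heading")
        return attr  # codec-based placeholder logic continues as before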
Co-Authored-By: Claude Opus 4.5 --- src/datajoint/preview.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/datajoint/preview.py b/src/datajoint/preview.py index c0f103eb1..1710bff6b 100644 --- a/src/datajoint/preview.py +++ b/src/datajoint/preview.py @@ -26,9 +26,11 @@ def _format_object_display(json_data): def _get_blob_placeholder(heading, field_name, html_escape=False): """Get display placeholder for a blob/json field based on its codec.""" + from .errors import DataJointError + attr = heading.attributes.get(field_name) if attr is None: - return "bytes" + raise DataJointError(f"Field '{field_name}' not found in heading") if attr.codec is not None: name = attr.codec.name if html_escape: From 12ea8140f017698acd4143db08861611a9e58472 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 12 Jan 2026 18:09:29 -0600 Subject: [PATCH 06/10] feat: Add mmap_mode parameter to NpyRef.load() Support memory-mapped loading for large arrays: - Local filesystem stores: mmap directly, no download - Remote stores: download to cache, then mmap Co-Authored-By: Claude Opus 4.5 --- src/datajoint/builtin_codecs.py | 67 ++++++++++++++++++++++++---- tests/integration/test_npy_codec.py | 68 +++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 9 deletions(-) diff --git a/src/datajoint/builtin_codecs.py b/src/datajoint/builtin_codecs.py index 499fec846..ff1977242 100644 --- a/src/datajoint/builtin_codecs.py +++ b/src/datajoint/builtin_codecs.py @@ -940,28 +940,77 @@ def is_loaded(self) -> bool: """True if array data has been downloaded and cached.""" return self._cached is not None - def load(self): + def load(self, mmap_mode=None): """ Download and return the array. + Parameters + ---------- + mmap_mode : str, optional + Memory-map mode for lazy, random-access loading of large arrays: + + - ``'r'``: Read-only + - ``'r+'``: Read-write + - ``'c'``: Copy-on-write (changes not saved to disk) + + If None (default), loads entire array into memory. + Returns ------- - numpy.ndarray - The array data. + numpy.ndarray or numpy.memmap + The array data. Returns ``numpy.memmap`` if mmap_mode is specified. Notes ----- - The array is cached after first load. Subsequent calls return - the cached copy without additional I/O. + When ``mmap_mode`` is None, the array is cached after first load. + + For local filesystem stores, memory mapping accesses the file directly + with no download. For remote stores (S3, etc.), the file is downloaded + to a local cache (``{tempdir}/datajoint_mmap/``) before memory mapping. 
+ + Examples + -------- + Standard loading:: + + arr = ref.load() # Loads entire array into memory + + Memory-mapped for random access to large arrays:: + + arr = ref.load(mmap_mode='r') + slice = arr[1000:2000] # Only reads the needed portion from disk """ import io import numpy as np - if self._cached is None: - buffer = self._backend.get_buffer(self.path) - self._cached = np.load(io.BytesIO(buffer), allow_pickle=False) - return self._cached + if mmap_mode is None: + # Standard loading with caching + if self._cached is None: + buffer = self._backend.get_buffer(self.path) + self._cached = np.load(io.BytesIO(buffer), allow_pickle=False) + return self._cached + else: + # Memory-mapped loading + if self._backend.protocol == "file": + # Local filesystem - mmap directly, no download needed + local_path = self._backend._full_path(self.path) + return np.load(local_path, mmap_mode=mmap_mode, allow_pickle=False) + else: + # Remote storage - download to local cache first + import hashlib + import tempfile + from pathlib import Path + + path_hash = hashlib.md5(self.path.encode()).hexdigest()[:12] + cache_dir = Path(tempfile.gettempdir()) / "datajoint_mmap" + cache_dir.mkdir(exist_ok=True) + cache_path = cache_dir / f"{path_hash}.npy" + + if not cache_path.exists(): + buffer = self._backend.get_buffer(self.path) + cache_path.write_bytes(buffer) + + return np.load(str(cache_path), mmap_mode=mmap_mode, allow_pickle=False) def __array__(self, dtype=None): """ diff --git a/tests/integration/test_npy_codec.py b/tests/integration/test_npy_codec.py index b5438c68b..70e3e098a 100644 --- a/tests/integration/test_npy_codec.py +++ b/tests/integration/test_npy_codec.py @@ -139,6 +139,74 @@ class MockBackend: with pytest.raises(TypeError, match="0-dimensional"): len(ref) + def test_npy_ref_mmap_local_filesystem(self, tmp_path): + """NpyRef mmap_mode should work directly on local filesystem.""" + # Create a real .npy file + test_array = np.arange(100, dtype=np.float64) + npy_path = tmp_path / "test.npy" + np.save(npy_path, test_array) + + metadata = { + "path": "test.npy", + "store": None, + "dtype": "float64", + "shape": [100], + } + + # Mock backend that simulates local filesystem + class MockFileBackend: + protocol = "file" + + def _full_path(self, path): + return str(tmp_path / path) + + def get_buffer(self, path): + return (tmp_path / path).read_bytes() + + ref = NpyRef(metadata, MockFileBackend()) + + # Load with mmap_mode + mmap_arr = ref.load(mmap_mode="r") + + # Should be a memmap + assert isinstance(mmap_arr, np.memmap) + np.testing.assert_array_equal(mmap_arr, test_array) + + # Standard load should still work and cache + regular_arr = ref.load() + assert isinstance(regular_arr, np.ndarray) + assert not isinstance(regular_arr, np.memmap) + np.testing.assert_array_equal(regular_arr, test_array) + + def test_npy_ref_mmap_remote_storage(self, tmp_path): + """NpyRef mmap_mode should download to cache for remote storage.""" + # Create test data + test_array = np.array([1, 2, 3, 4, 5], dtype=np.int32) + npy_buffer = np.save(tmp_path / "temp.npy", test_array) + npy_bytes = (tmp_path / "temp.npy").read_bytes() + + metadata = { + "path": "remote/path/data.npy", + "store": "s3-store", + "dtype": "int32", + "shape": [5], + } + + # Mock backend that simulates remote storage + class MockS3Backend: + protocol = "s3" + + def get_buffer(self, path): + return npy_bytes + + ref = NpyRef(metadata, MockS3Backend()) + + # Load with mmap_mode - should download to cache + mmap_arr = ref.load(mmap_mode="r") + + assert 
isinstance(mmap_arr, np.memmap) + np.testing.assert_array_equal(mmap_arr, test_array) + class TestNpyCodecUnit: """Unit tests for NpyCodec without database.""" From c02a8826a5d7e740573f90d77c5ef0410e10cb72 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Mon, 12 Jan 2026 18:21:01 -0600 Subject: [PATCH 07/10] fix: Remove unused variable in mmap test --- tests/integration/test_npy_codec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_npy_codec.py b/tests/integration/test_npy_codec.py index 70e3e098a..bf8a8bcac 100644 --- a/tests/integration/test_npy_codec.py +++ b/tests/integration/test_npy_codec.py @@ -182,7 +182,7 @@ def test_npy_ref_mmap_remote_storage(self, tmp_path): """NpyRef mmap_mode should download to cache for remote storage.""" # Create test data test_array = np.array([1, 2, 3, 4, 5], dtype=np.int32) - npy_buffer = np.save(tmp_path / "temp.npy", test_array) + np.save(tmp_path / "temp.npy", test_array) npy_bytes = (tmp_path / "temp.npy").read_bytes() metadata = { From 0d1ffe7b822cfb3ed30fc6060dd966d693131ec3 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 13 Jan 2026 13:23:35 -0600 Subject: [PATCH 08/10] refactor: Rename content_registry to hash_registry with path-based storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major changes to hash-addressed storage model: - Rename content_registry.py → hash_registry.py for clarity - Always store full path in metadata (protects against config changes) - Use stored path directly for retrieval (no path regeneration) - Add delete_path() as primary function, deprecate delete_hash() - Add get_size() as primary function, deprecate get_hash_size() - Update gc.py to work with paths instead of hashes - Update builtin_codecs.py HashCodec to use new API This design enables seamless migration from v0.14: - Legacy data keeps old paths in metadata - New data uses new path structure - GC compares stored paths against filesystem Co-Authored-By: Claude Opus 4.5 --- .gitignore | 2 +- pyproject.toml | 2 +- src/datajoint/builtin_codecs.py | 55 +-- src/datajoint/content_registry.py | 231 ------------ src/datajoint/gc.py | 403 +++++++++++---------- src/datajoint/hash_registry.py | 415 ++++++++++++++++++++++ tests/integration/test_content_storage.py | 231 ------------ tests/integration/test_gc.py | 208 +++++------ tests/integration/test_hash_storage.py | 304 ++++++++++++++++ 9 files changed, 1080 insertions(+), 771 deletions(-) delete mode 100644 src/datajoint/content_registry.py create mode 100644 src/datajoint/hash_registry.py delete mode 100644 tests/integration/test_content_storage.py create mode 100644 tests/integration/test_hash_storage.py diff --git a/.gitignore b/.gitignore index 3c88c420c..5079dca62 100644 --- a/.gitignore +++ b/.gitignore @@ -187,7 +187,7 @@ dj_local_conf.json !.vscode/launch.json # pixi environments .pixi -_content/ +_hash/ # Local config .secrets/ diff --git a/pyproject.toml b/pyproject.toml index ef9e622a2..7cd06d786 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -168,7 +168,7 @@ check_untyped_defs = true # Modules with complete type coverage - strict checking enabled [[tool.mypy.overrides]] module = [ - "datajoint.content_registry", + "datajoint.hash_registry", "datajoint.errors", "datajoint.hash", ] diff --git a/src/datajoint/builtin_codecs.py b/src/datajoint/builtin_codecs.py index ff1977242..96c95b7b9 100644 --- a/src/datajoint/builtin_codecs.py +++ b/src/datajoint/builtin_codecs.py @@ -7,8 +7,8 @@ Built-in Codecs: - 
````: Serialize Python objects (internal) or external with dedup - - ````: Hash-addressed storage with MD5 deduplication - - ````: Path-addressed storage for files/folders (Zarr, HDF5) + - ````: Hash-addressed storage with SHA256 deduplication + - ````: Schema-addressed storage for files/folders (Zarr, HDF5) - ````: File attachment (internal) or external with dedup - ````: Reference to existing file in store - ````: Store numpy arrays as portable .npy files (external only) @@ -127,14 +127,16 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> Any: class HashCodec(Codec): """ - Hash-addressed storage with MD5 deduplication. + Hash-addressed storage with SHA256 deduplication. - The ```` codec stores raw bytes using content-addressed storage. - Data is identified by its MD5 hash and stored in a hierarchical directory: + The ```` codec stores raw bytes using hash-addressed storage. + Data is identified by its SHA256 hash and stored in a hierarchical directory: ``_hash/{hash[:2]}/{hash[2:4]}/{hash}`` The database column stores JSON metadata: ``{hash, store, size}``. - Duplicate content is automatically deduplicated. + Duplicate content is automatically deduplicated across all tables. + + Deletion: Requires garbage collection via ``dj.gc.collect()``. External only - requires @ modifier. @@ -154,6 +156,10 @@ class RawContent(dj.Manual): Note: This codec accepts only ``bytes``. For Python objects, use ````. Typically used indirectly via ```` or ```` rather than directly. + + See Also + -------- + datajoint.gc : Garbage collection for orphaned storage. """ name = "hash" @@ -173,38 +179,39 @@ def encode(self, value: bytes, *, key: dict | None = None, store_name: str | Non value : bytes Raw bytes to store. key : dict, optional - Primary key values (unused). + Context dict with ``_schema`` for path isolation. store_name : str, optional Store to use. If None, uses default store. Returns ------- dict - Metadata dict: ``{hash, store, size}``. + Metadata dict: ``{hash, path, schema, store, size}``. """ - from .content_registry import put_content + from .hash_registry import put_hash - return put_content(value, store_name=store_name) + schema_name = (key or {}).get("_schema", "unknown") + return put_hash(value, schema_name=schema_name, store_name=store_name) def decode(self, stored: dict, *, key: dict | None = None) -> bytes: """ - Retrieve content by hash. + Retrieve content using stored metadata. Parameters ---------- stored : dict - Metadata dict with ``'hash'`` and optionally ``'store'``. + Metadata dict with ``'path'``, ``'hash'``, and optionally ``'store'``. key : dict, optional - Primary key values (unused). + Context dict (unused - path is in metadata). Returns ------- bytes Original bytes. """ - from .content_registry import get_content + from .hash_registry import get_hash - return get_content(stored["hash"], store_name=stored.get("store")) + return get_hash(stored) def validate(self, value: Any) -> None: """Validate that value is bytes.""" @@ -366,7 +373,7 @@ def _get_backend(self, store_name: str | None = None): StorageBackend Storage backend instance. """ - from .content_registry import get_store_backend + from .hash_registry import get_store_backend return get_store_backend(store_name) @@ -384,8 +391,8 @@ class ObjectCodec(SchemaCodec): schema-addressed paths: ``{schema}/{table}/{pk}/{field}/``. This creates a browsable organization in object storage that mirrors the database schema. 
- Unlike hash-addressed storage (````), each row has its own path - and content is deleted when the row is deleted. Ideal for: + Unlike hash-addressed storage (````), each row has its own unique path + (no deduplication). Ideal for: - Zarr arrays (hierarchical chunked data) - HDF5 files @@ -419,17 +426,20 @@ def make(self, key): {store_root}/{schema}/{table}/{pk}/{field}/ + Deletion: Requires garbage collection via ``dj.gc.collect()``. + Comparison with hash-addressed:: | Aspect | | | |----------------|---------------------|---------------------| | Addressing | Schema-addressed | Hash-addressed | | Deduplication | No | Yes | - | Deletion | With row | GC when unreferenced| + | Deletion | GC required | GC required | | Use case | Zarr, HDF5 | Blobs, attachments | See Also -------- + datajoint.gc : Garbage collection for orphaned storage. SchemaCodec : Base class for schema-addressed codecs. NpyCodec : Schema-addressed storage for numpy arrays. HashCodec : Hash-addressed storage with deduplication. @@ -782,7 +792,7 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None """ from datetime import datetime, timezone - from .content_registry import get_store_backend + from .hash_registry import get_store_backend path = str(value) @@ -822,7 +832,7 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: Handle for accessing the file. """ from .objectref import ObjectRef - from .content_registry import get_store_backend + from .hash_registry import get_store_backend store_name = stored.get("store") backend = get_store_backend(store_name) @@ -1103,8 +1113,11 @@ class Recording(dj.Manual): - Path: ``{schema}/{table}/{pk}/{attribute}.npy`` - Database column: JSON with ``{path, store, dtype, shape}`` + Deletion: Requires garbage collection via ``dj.gc.collect()``. + See Also -------- + datajoint.gc : Garbage collection for orphaned storage. NpyRef : The lazy array reference returned on fetch. SchemaCodec : Base class for schema-addressed codecs. ObjectCodec : Schema-addressed storage for files/folders. diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py deleted file mode 100644 index 70b38324a..000000000 --- a/src/datajoint/content_registry.py +++ /dev/null @@ -1,231 +0,0 @@ -""" -Content-addressed storage registry for DataJoint. - -This module provides content-addressed storage with deduplication for the -Codec. Content is identified by its MD5 hash and stored in a hierarchical -directory structure: _hash/{hash[:2]}/{hash[2:4]}/{hash} - -The ContentRegistry tracks stored content for garbage collection purposes. -""" - -import hashlib -import logging -from typing import Any - -from .errors import DataJointError -from .settings import config -from .storage import StorageBackend - -logger = logging.getLogger(__name__.split(".")[0]) - - -def compute_content_hash(data: bytes) -> str: - """ - Compute SHA256 hash of content. - - Parameters - ---------- - data : bytes - Content bytes. - - Returns - ------- - str - Hex-encoded SHA256 hash (64 characters). - """ - return hashlib.sha256(data).hexdigest() - - -def build_content_path(content_hash: str) -> str: - """ - Build the storage path for content-addressed storage. - - Content is stored in a hierarchical structure to avoid too many files - in a single directory: _content/{hash[:2]}/{hash[2:4]}/{hash} - - Parameters - ---------- - content_hash : str - SHA256 hex hash (64 characters). - - Returns - ------- - str - Relative path within the store. 
- """ - if len(content_hash) != 64: - raise DataJointError(f"Invalid content hash length: {len(content_hash)} (expected 64)") - return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}" - - -def get_store_backend(store_name: str | None = None) -> StorageBackend: - """ - Get a StorageBackend for content storage. - - Parameters - ---------- - store_name : str, optional - Name of the store to use. If None, uses the default object storage - configuration or the configured default_store. - - Returns - ------- - StorageBackend - StorageBackend instance. - """ - # If store_name is None, check for configured default_store - if store_name is None and config.object_storage.default_store: - store_name = config.object_storage.default_store - - # get_object_store_spec handles None by returning default object_storage config - spec = config.get_object_store_spec(store_name) - return StorageBackend(spec) - - -def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: - """ - Store content using content-addressed storage. - - If the content already exists (same hash), it is not re-uploaded. - Returns metadata including the hash, store, and size. - - Parameters - ---------- - data : bytes - Content bytes to store. - store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - dict[str, Any] - Metadata dict with keys: hash, store, size. - """ - content_hash = compute_content_hash(data) - path = build_content_path(content_hash) - - backend = get_store_backend(store_name) - - # Check if content already exists (deduplication) - if not backend.exists(path): - backend.put_buffer(data, path) - logger.debug(f"Stored new content: {content_hash[:16]}... ({len(data)} bytes)") - else: - logger.debug(f"Content already exists: {content_hash[:16]}...") - - return { - "hash": content_hash, - "store": store_name, - "size": len(data), - } - - -def get_content(content_hash: str, store_name: str | None = None) -> bytes: - """ - Retrieve content by its hash. - - Parameters - ---------- - content_hash : str - SHA256 hex hash of the content. - store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - bytes - Content bytes. - - Raises - ------ - MissingExternalFile - If content is not found. - DataJointError - If hash verification fails. - """ - path = build_content_path(content_hash) - backend = get_store_backend(store_name) - - data = backend.get_buffer(path) - - # Verify hash (optional but recommended for integrity) - actual_hash = compute_content_hash(data) - if actual_hash != content_hash: - raise DataJointError(f"Content hash mismatch: expected {content_hash[:16]}..., got {actual_hash[:16]}...") - - return data - - -def content_exists(content_hash: str, store_name: str | None = None) -> bool: - """ - Check if content exists in storage. - - Parameters - ---------- - content_hash : str - SHA256 hex hash of the content. - store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - bool - True if content exists. - """ - path = build_content_path(content_hash) - backend = get_store_backend(store_name) - return backend.exists(path) - - -def delete_content(content_hash: str, store_name: str | None = None) -> bool: - """ - Delete content from storage. - - This should only be called after verifying no references exist. - Use garbage collection to safely remove unreferenced content. - - Parameters - ---------- - content_hash : str - SHA256 hex hash of the content. 
- store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - bool - True if content was deleted, False if it didn't exist. - - Warnings - -------- - This permanently deletes content. Ensure no references exist first. - """ - path = build_content_path(content_hash) - backend = get_store_backend(store_name) - - if backend.exists(path): - backend.remove(path) - logger.debug(f"Deleted content: {content_hash[:16]}...") - return True - return False - - -def get_content_size(content_hash: str, store_name: str | None = None) -> int: - """ - Get the size of stored content. - - Parameters - ---------- - content_hash : str - SHA256 hex hash of the content. - store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - int - Size in bytes. - """ - path = build_content_path(content_hash) - backend = get_store_backend(store_name) - return backend.size(path) diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index 7570e6f24..c3f2d6f0f 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -1,25 +1,37 @@ """ Garbage collection for external storage. -This module provides utilities to identify and remove orphaned content -from external storage. Content becomes orphaned when all database rows -referencing it are deleted. +This module provides utilities to identify and remove orphaned items +from external storage. Storage items become orphaned when all database rows +referencing them are deleted. -Supports two storage patterns: -- Content-addressed storage: , , - Stored at: _content/{hash[:2]}/{hash[2:4]}/{hash} +DataJoint uses two external storage patterns: -- Path-addressed storage: - Stored at: {schema}/{table}/objects/{pk}/{field}_{token}/ +Hash-addressed storage + Types: ````, ````, ```` + Path: ``_hash/{schema}/{hash}`` (with optional subfolding) + Deduplication: Per-schema (identical data within a schema shares storage) + Deletion: Requires garbage collection + +Schema-addressed storage + Types: ````, ```` + Path: ``{schema}/{table}/{pk}/{field}/`` + Deduplication: None (each entity has unique path) + Deletion: Requires garbage collection + +Usage:: -Usage: import datajoint as dj - # Scan schemas and find orphaned content + # Scan schemas and find orphaned items stats = dj.gc.scan(schema1, schema2, store_name='mystore') - # Remove orphaned content (dry_run=False to actually delete) + # Remove orphaned items (dry_run=False to actually delete) stats = dj.gc.collect(schema1, schema2, store_name='mystore', dry_run=True) + +See Also +-------- +datajoint.builtin_codecs : Codec implementations for external storage types. """ from __future__ import annotations @@ -28,7 +40,7 @@ import logging from typing import TYPE_CHECKING, Any -from .content_registry import delete_content, get_store_backend +from .hash_registry import delete_path, get_store_backend from .errors import DataJointError if TYPE_CHECKING: @@ -37,14 +49,15 @@ logger = logging.getLogger(__name__.split(".")[0]) -def _uses_content_storage(attr) -> bool: +def _uses_hash_storage(attr) -> bool: """ - Check if an attribute uses content-addressed storage. + Check if an attribute uses hash-addressed storage. 
- This includes types that chain to for external storage: - - directly - - (chains to ) - - (chains to ) + Hash-addressed types use content deduplication via MD5/Base32 hashing: + + - ```` - raw hash storage + - ```` - chains to ```` + - ```` - chains to ```` Parameters ---------- @@ -54,29 +67,33 @@ def _uses_content_storage(attr) -> bool: Returns ------- bool - True if the attribute stores content hashes. + True if the attribute uses hash-addressed storage. """ if not attr.codec: return False - # Check if this type uses content storage codec_name = getattr(attr.codec, "name", "") store = getattr(attr, "store", None) - # always uses content storage (external only) + # always uses hash-addressed storage (external only) if codec_name == "hash": return True - # and use content storage when external (has store) + # and use hash-addressed storage when external if codec_name in ("blob", "attach") and store is not None: return True return False -def _uses_object_storage(attr) -> bool: +def _uses_schema_storage(attr) -> bool: """ - Check if an attribute uses path-addressed object storage. + Check if an attribute uses schema-addressed storage. + + Schema-addressed types store data at paths derived from the schema structure: + + - ```` - arbitrary objects (pickled or native formats) + - ```` - NumPy arrays with lazy loading Parameters ---------- @@ -86,28 +103,31 @@ def _uses_object_storage(attr) -> bool: Returns ------- bool - True if the attribute stores object paths. + True if the attribute uses schema-addressed storage. """ if not attr.codec: return False codec_name = getattr(attr.codec, "name", "") - return codec_name == "object" + return codec_name in ("object", "npy") -def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: +def _extract_hash_refs(value: Any) -> list[tuple[str, str | None]]: """ - Extract content references from a stored value. + Extract path references from hash-addressed storage metadata. + + Hash-addressed storage stores metadata as JSON with ``path`` and ``hash`` keys. + The path is used for file operations; the hash is for integrity verification. Parameters ---------- value : Any - The stored value (could be JSON string or dict). + The stored value (JSON string or dict). Returns ------- list[tuple[str, str | None]] - List of (content_hash, store_name) tuples. + List of (path, store_name) tuples. """ refs = [] @@ -121,21 +141,23 @@ def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: except (json.JSONDecodeError, TypeError): return refs - # Extract hash from dict - if isinstance(value, dict) and "hash" in value: - refs.append((value["hash"], value.get("store"))) + # Extract path from dict (path is required for new data, hash for legacy) + if isinstance(value, dict) and "path" in value: + refs.append((value["path"], value.get("store"))) return refs -def _extract_object_refs(value: Any) -> list[tuple[str, str | None]]: +def _extract_schema_refs(value: Any) -> list[tuple[str, str | None]]: """ - Extract object path references from a stored value. + Extract schema-addressed path references from a stored value. + + Schema-addressed storage stores metadata as JSON with a ``path`` key. Parameters ---------- value : Any - The stored value (could be JSON string or dict). + The stored value (JSON string or dict). 
Returns ------- @@ -161,16 +183,17 @@ def _extract_object_refs(value: Any) -> list[tuple[str, str | None]]: return refs -def scan_references( +def scan_hash_references( *schemas: "Schema", store_name: str | None = None, verbose: bool = False, ) -> set[str]: """ - Scan schemas for content references. + Scan schemas for hash-addressed storage references. - Examines all tables in the given schemas and extracts content hashes - from columns that use content-addressed storage (, , ). + Examines all tables in the given schemas and extracts storage paths + from columns that use hash-addressed storage (````, ````, + ````). Parameters ---------- @@ -184,7 +207,7 @@ def scan_references( Returns ------- set[str] - Set of content hashes that are referenced. + Set of storage paths that are referenced. """ referenced: set[str] = set() @@ -198,23 +221,22 @@ def scan_references( # Get table class table = schema.get_table(table_name) - # Check each attribute for content storage + # Check each attribute for hash-addressed storage for attr_name, attr in table.heading.attributes.items(): - if not _uses_content_storage(attr): + if not _uses_hash_storage(attr): continue if verbose: logger.info(f" Scanning {table_name}.{attr_name}") # Fetch all values for this attribute - # Use to_arrays to get attribute values try: values = table.to_arrays(attr_name) for value in values: - for content_hash, ref_store in _extract_content_refs(value): + for path, ref_store in _extract_hash_refs(value): # Filter by store if specified if store_name is None or ref_store == store_name: - referenced.add(content_hash) + referenced.add(path) except Exception as e: logger.warning(f"Error scanning {table_name}.{attr_name}: {e}") @@ -224,16 +246,16 @@ def scan_references( return referenced -def scan_object_references( +def scan_schema_references( *schemas: "Schema", store_name: str | None = None, verbose: bool = False, ) -> set[str]: """ - Scan schemas for object path references. + Scan schemas for schema-addressed storage references. - Examines all tables in the given schemas and extracts object paths - from columns that use path-addressed storage (). + Examines all tables in the given schemas and extracts paths from columns + that use schema-addressed storage (````, ````). Parameters ---------- @@ -247,13 +269,13 @@ def scan_object_references( Returns ------- set[str] - Set of object paths that are referenced. + Set of storage paths that are referenced. 
""" referenced: set[str] = set() for schema in schemas: if verbose: - logger.info(f"Scanning schema for objects: {schema.database}") + logger.info(f"Scanning schema for schema-addressed storage: {schema.database}") # Get all tables in schema for table_name in schema.list_tables(): @@ -261,9 +283,9 @@ def scan_object_references( # Get table class table = schema.get_table(table_name) - # Check each attribute for object storage + # Check each attribute for schema-addressed storage for attr_name, attr in table.heading.attributes.items(): - if not _uses_object_storage(attr): + if not _uses_schema_storage(attr): continue if verbose: @@ -273,7 +295,7 @@ def scan_object_references( try: values = table.to_arrays(attr_name) for value in values: - for path, ref_store in _extract_object_refs(value): + for path, ref_store in _extract_schema_refs(value): # Filter by store if specified if store_name is None or ref_store == store_name: referenced.add(path) @@ -286,12 +308,13 @@ def scan_object_references( return referenced -def list_stored_content(store_name: str | None = None) -> dict[str, int]: +def list_stored_hashes(store_name: str | None = None) -> dict[str, int]: """ - List all content hashes in storage. + List all hash-addressed items in storage. - Scans the _content/ directory in the specified store and returns - all content hashes found. + Scans the ``_hash/`` directory in the specified store and returns + all storage paths found. These correspond to ````, ````, + and ```` types. Parameters ---------- @@ -301,17 +324,20 @@ def list_stored_content(store_name: str | None = None) -> dict[str, int]: Returns ------- dict[str, int] - Dict mapping content_hash to size in bytes. + Dict mapping storage path to size in bytes. """ + import re + backend = get_store_backend(store_name) stored: dict[str, int] = {} - # Content is stored at _content/{hash[:2]}/{hash[2:4]}/{hash} - content_prefix = "_content/" + # Hash-addressed storage: _hash/{schema}/{subfolders...}/{hash} + hash_prefix = "_hash/" + # Base32 pattern: 26 lowercase alphanumeric chars + base32_pattern = re.compile(r"^[a-z2-7]{26}$") try: - # List all files under _content/ - full_prefix = backend._full_path(content_prefix) + full_prefix = backend._full_path(hash_prefix) for root, dirs, files in backend.fs.walk(full_prefix): for filename in files: @@ -319,33 +345,36 @@ def list_stored_content(store_name: str | None = None) -> dict[str, int]: if filename.endswith(".manifest.json"): continue - # The filename is the full hash + # The filename is the base32 hash content_hash = filename - # Validate it looks like a hash (64 hex chars) - if len(content_hash) == 64 and all(c in "0123456789abcdef" for c in content_hash): + # Validate it looks like a base32 hash + if base32_pattern.match(content_hash): try: file_path = f"{root}/{filename}" size = backend.fs.size(file_path) - stored[content_hash] = size + # Build relative path for comparison with stored metadata + # Path format: _hash/{schema}/{subfolders...}/{hash} + relative_path = file_path.replace(backend._full_path(""), "").lstrip("/") + stored[relative_path] = size except Exception: - stored[content_hash] = 0 + pass except FileNotFoundError: - # No _content/ directory exists yet + # No _hash/ directory exists yet pass except Exception as e: - logger.warning(f"Error listing stored content: {e}") + logger.warning(f"Error listing stored hashes: {e}") return stored -def list_stored_objects(store_name: str | None = None) -> dict[str, int]: +def list_schema_paths(store_name: str | None = None) -> dict[str, 
int]: """ - List all object paths in storage. + List all schema-addressed items in storage. - Scans for directories matching the object storage pattern: - {schema}/{table}/objects/{pk}/{field}_{token}/ + Scans for directories matching the schema-addressed storage pattern: + ``{schema}/{table}/{pk}/{field}/`` Parameters ---------- @@ -355,55 +384,57 @@ def list_stored_objects(store_name: str | None = None) -> dict[str, int]: Returns ------- dict[str, int] - Dict mapping object_path to size in bytes. + Dict mapping storage path to size in bytes. """ backend = get_store_backend(store_name) stored: dict[str, int] = {} try: - # Walk the storage looking for /objects/ directories + # Walk the storage looking for schema-addressed paths full_prefix = backend._full_path("") for root, dirs, files in backend.fs.walk(full_prefix): - # Skip _content directory - if "_content" in root: + # Skip _hash directory (hash-addressed storage) + if "_hash" in root: continue - # Look for "objects" directory pattern - if "/objects/" in root: - # This could be an object storage path - # Path pattern: {schema}/{table}/objects/{pk}/{field}_{token} - relative_path = root.replace(full_prefix, "").lstrip("/") + # Look for schema-addressed pattern (has files, not in _hash) + # Schema-addressed paths: {schema}/{table}/{pk}/{field}/ + relative_path = root.replace(full_prefix, "").lstrip("/") - # Calculate total size of this object directory - total_size = 0 - for file in files: - try: - file_path = f"{root}/{file}" - total_size += backend.fs.size(file_path) - except Exception: - pass + # Skip empty paths and root-level directories + if not relative_path or relative_path.count("/") < 2: + continue + + # Calculate total size of this directory + total_size = 0 + for file in files: + try: + file_path = f"{root}/{file}" + total_size += backend.fs.size(file_path) + except Exception: + pass - # Only count directories with files (actual objects) - if total_size > 0 or files: - stored[relative_path] = total_size + # Only count directories with files (actual objects) + if total_size > 0 or files: + stored[relative_path] = total_size except FileNotFoundError: pass except Exception as e: - logger.warning(f"Error listing stored objects: {e}") + logger.warning(f"Error listing stored schemas: {e}") return stored -def delete_object(path: str, store_name: str | None = None) -> bool: +def delete_schema_path(path: str, store_name: str | None = None) -> bool: """ - Delete an object directory from storage. + Delete a schema-addressed directory from storage. Parameters ---------- path : str - Object path (relative to store root). + Storage path (relative to store root). store_name : str, optional Store name (None = default store). @@ -419,10 +450,10 @@ def delete_object(path: str, store_name: str | None = None) -> bool: if backend.fs.exists(full_path): # Remove entire directory tree backend.fs.rm(full_path, recursive=True) - logger.debug(f"Deleted object: {path}") + logger.debug(f"Deleted schema path: {path}") return True except Exception as e: - logger.warning(f"Error deleting object {path}: {e}") + logger.warning(f"Error deleting schema path {path}: {e}") return False @@ -433,10 +464,10 @@ def scan( verbose: bool = False, ) -> dict[str, Any]: """ - Scan for orphaned content and objects without deleting. + Scan for orphaned storage items without deleting. - Scans both content-addressed storage (for , , ) - and path-addressed storage (for ). 
+ Scans both hash-addressed storage (for ````, ````, ````) + and schema-addressed storage (for ````, ````). Parameters ---------- @@ -452,50 +483,50 @@ def scan( dict[str, Any] Dict with scan statistics: - - content_referenced: Number of content items referenced in database - - content_stored: Number of content items in storage - - content_orphaned: Number of unreferenced content items - - content_orphaned_bytes: Total size of orphaned content + - hash_referenced: Number of hash items referenced in database + - hash_stored: Number of hash items in storage + - hash_orphaned: Number of unreferenced hash items + - hash_orphaned_bytes: Total size of orphaned hashes - orphaned_hashes: List of orphaned content hashes - - object_referenced: Number of objects referenced in database - - object_stored: Number of objects in storage - - object_orphaned: Number of unreferenced objects - - object_orphaned_bytes: Total size of orphaned objects - - orphaned_paths: List of orphaned object paths + - schema_paths_referenced: Number of schema items referenced in database + - schema_paths_stored: Number of schema items in storage + - schema_paths_orphaned: Number of unreferenced schema items + - schema_paths_orphaned_bytes: Total size of orphaned schema items + - orphaned_paths: List of orphaned schema paths """ if not schemas: raise DataJointError("At least one schema must be provided") - # --- Content-addressed storage --- - content_referenced = scan_references(*schemas, store_name=store_name, verbose=verbose) - content_stored = list_stored_content(store_name) - orphaned_hashes = set(content_stored.keys()) - content_referenced - content_orphaned_bytes = sum(content_stored.get(h, 0) for h in orphaned_hashes) + # --- Hash-addressed storage --- + hash_referenced = scan_hash_references(*schemas, store_name=store_name, verbose=verbose) + hash_stored = list_stored_hashes(store_name) + orphaned_hashes = set(hash_stored.keys()) - hash_referenced + hash_orphaned_bytes = sum(hash_stored.get(h, 0) for h in orphaned_hashes) - # --- Path-addressed storage (objects) --- - object_referenced = scan_object_references(*schemas, store_name=store_name, verbose=verbose) - object_stored = list_stored_objects(store_name) - orphaned_paths = set(object_stored.keys()) - object_referenced - object_orphaned_bytes = sum(object_stored.get(p, 0) for p in orphaned_paths) + # --- Schema-addressed storage --- + schema_paths_referenced = scan_schema_references(*schemas, store_name=store_name, verbose=verbose) + schema_paths_stored = list_schema_paths(store_name) + orphaned_paths = set(schema_paths_stored.keys()) - schema_paths_referenced + schema_paths_orphaned_bytes = sum(schema_paths_stored.get(p, 0) for p in orphaned_paths) return { - # Content-addressed storage stats - "content_referenced": len(content_referenced), - "content_stored": len(content_stored), - "content_orphaned": len(orphaned_hashes), - "content_orphaned_bytes": content_orphaned_bytes, + # Hash-addressed storage stats + "hash_referenced": len(hash_referenced), + "hash_stored": len(hash_stored), + "hash_orphaned": len(orphaned_hashes), + "hash_orphaned_bytes": hash_orphaned_bytes, "orphaned_hashes": sorted(orphaned_hashes), - # Path-addressed storage stats - "object_referenced": len(object_referenced), - "object_stored": len(object_stored), - "object_orphaned": len(orphaned_paths), - "object_orphaned_bytes": object_orphaned_bytes, + # Schema-addressed storage stats + "schema_paths_referenced": len(schema_paths_referenced), + "schema_paths_stored": len(schema_paths_stored), + 
"schema_paths_orphaned": len(orphaned_paths), + "schema_paths_orphaned_bytes": schema_paths_orphaned_bytes, "orphaned_paths": sorted(orphaned_paths), # Combined totals - "referenced": len(content_referenced) + len(object_referenced), - "stored": len(content_stored) + len(object_stored), + "referenced": len(hash_referenced) + len(schema_paths_referenced), + "stored": len(hash_stored) + len(schema_paths_stored), "orphaned": len(orphaned_hashes) + len(orphaned_paths), - "orphaned_bytes": content_orphaned_bytes + object_orphaned_bytes, + "orphaned_bytes": hash_orphaned_bytes + schema_paths_orphaned_bytes, } @@ -506,10 +537,10 @@ def collect( verbose: bool = False, ) -> dict[str, Any]: """ - Remove orphaned content and objects from storage. + Remove orphaned storage items. - Scans the given schemas for content and object references, then removes any - storage items that are not referenced. + Scans the given schemas for storage references, then removes any + items that are not referenced. Parameters ---------- @@ -530,66 +561,66 @@ def collect( - referenced: Total items referenced in database - stored: Total items in storage - orphaned: Total unreferenced items - - content_deleted: Number of content items deleted - - object_deleted: Number of object items deleted + - hash_deleted: Number of hash items deleted + - schema_paths_deleted: Number of schema items deleted - deleted: Total items deleted (0 if dry_run) - bytes_freed: Bytes freed (0 if dry_run) - errors: Number of deletion errors """ - # First scan to find orphaned content and objects + # First scan to find orphaned items stats = scan(*schemas, store_name=store_name, verbose=verbose) - content_deleted = 0 - object_deleted = 0 + hash_deleted = 0 + schema_paths_deleted = 0 bytes_freed = 0 errors = 0 if not dry_run: - # Delete orphaned content (hash-addressed) - if stats["content_orphaned"] > 0: - content_stored = list_stored_content(store_name) + # Delete orphaned hashes + if stats["hash_orphaned"] > 0: + hash_stored = list_stored_hashes(store_name) - for content_hash in stats["orphaned_hashes"]: + for path in stats["orphaned_hashes"]: try: - size = content_stored.get(content_hash, 0) - if delete_content(content_hash, store_name): - content_deleted += 1 + size = hash_stored.get(path, 0) + if delete_path(path, store_name): + hash_deleted += 1 bytes_freed += size if verbose: - logger.info(f"Deleted content: {content_hash[:16]}... 
({size} bytes)") + logger.info(f"Deleted: {path} ({size} bytes)") except Exception as e: errors += 1 - logger.warning(f"Failed to delete content {content_hash[:16]}...: {e}") + logger.warning(f"Failed to delete {path}: {e}") - # Delete orphaned objects (path-addressed) - if stats["object_orphaned"] > 0: - object_stored = list_stored_objects(store_name) + # Delete orphaned schema paths + if stats["schema_paths_orphaned"] > 0: + schema_paths_stored = list_schema_paths(store_name) for path in stats["orphaned_paths"]: try: - size = object_stored.get(path, 0) - if delete_object(path, store_name): - object_deleted += 1 + size = schema_paths_stored.get(path, 0) + if delete_schema_path(path, store_name): + schema_paths_deleted += 1 bytes_freed += size if verbose: - logger.info(f"Deleted object: {path} ({size} bytes)") + logger.info(f"Deleted schema path: {path} ({size} bytes)") except Exception as e: errors += 1 - logger.warning(f"Failed to delete object {path}: {e}") + logger.warning(f"Failed to delete schema path {path}: {e}") return { "referenced": stats["referenced"], "stored": stats["stored"], "orphaned": stats["orphaned"], - "content_deleted": content_deleted, - "object_deleted": object_deleted, - "deleted": content_deleted + object_deleted, + "hash_deleted": hash_deleted, + "schema_paths_deleted": schema_paths_deleted, + "deleted": hash_deleted + schema_paths_deleted, "bytes_freed": bytes_freed, "errors": errors, "dry_run": dry_run, # Include detailed stats - "content_orphaned": stats["content_orphaned"], - "object_orphaned": stats["object_orphaned"], + "hash_orphaned": stats["hash_orphaned"], + "schema_paths_orphaned": stats["schema_paths_orphaned"], } @@ -609,26 +640,26 @@ def format_stats(stats: dict[str, Any]) -> str: """ lines = ["External Storage Statistics:"] - # Show content-addressed storage stats if present - if "content_referenced" in stats: + # Show hash-addressed storage stats if present + if "hash_referenced" in stats: lines.append("") - lines.append("Content-Addressed Storage (, , ):") - lines.append(f" Referenced: {stats['content_referenced']}") - lines.append(f" Stored: {stats['content_stored']}") - lines.append(f" Orphaned: {stats['content_orphaned']}") - if "content_orphaned_bytes" in stats: - size_mb = stats["content_orphaned_bytes"] / (1024 * 1024) + lines.append("Hash-Addressed Storage (, , ):") + lines.append(f" Referenced: {stats['hash_referenced']}") + lines.append(f" Stored: {stats['hash_stored']}") + lines.append(f" Orphaned: {stats['hash_orphaned']}") + if "hash_orphaned_bytes" in stats: + size_mb = stats["hash_orphaned_bytes"] / (1024 * 1024) lines.append(f" Orphaned size: {size_mb:.2f} MB") - # Show path-addressed storage stats if present - if "object_referenced" in stats: + # Show schema-addressed storage stats if present + if "schema_paths_referenced" in stats: lines.append("") - lines.append("Path-Addressed Storage ():") - lines.append(f" Referenced: {stats['object_referenced']}") - lines.append(f" Stored: {stats['object_stored']}") - lines.append(f" Orphaned: {stats['object_orphaned']}") - if "object_orphaned_bytes" in stats: - size_mb = stats["object_orphaned_bytes"] / (1024 * 1024) + lines.append("Schema-Addressed Storage (, ):") + lines.append(f" Referenced: {stats['schema_paths_referenced']}") + lines.append(f" Stored: {stats['schema_paths_stored']}") + lines.append(f" Orphaned: {stats['schema_paths_orphaned']}") + if "schema_paths_orphaned_bytes" in stats: + size_mb = stats["schema_paths_orphaned_bytes"] / (1024 * 1024) lines.append(f" Orphaned size: 
{size_mb:.2f} MB") # Show totals @@ -649,10 +680,10 @@ def format_stats(stats: dict[str, Any]) -> str: lines.append(" [DRY RUN - no changes made]") else: lines.append(f" Deleted: {stats['deleted']}") - if "content_deleted" in stats: - lines.append(f" Content: {stats['content_deleted']}") - if "object_deleted" in stats: - lines.append(f" Objects: {stats['object_deleted']}") + if "hash_deleted" in stats: + lines.append(f" Hash items: {stats['hash_deleted']}") + if "schema_paths_deleted" in stats: + lines.append(f" Schema paths: {stats['schema_paths_deleted']}") freed_mb = stats["bytes_freed"] / (1024 * 1024) lines.append(f" Bytes freed: {freed_mb:.2f} MB") if stats.get("errors", 0) > 0: diff --git a/src/datajoint/hash_registry.py b/src/datajoint/hash_registry.py new file mode 100644 index 000000000..7b286e874 --- /dev/null +++ b/src/datajoint/hash_registry.py @@ -0,0 +1,415 @@ +""" +Hash-addressed storage registry for DataJoint. + +This module provides hash-addressed storage with deduplication for the ```` +codec. Content is identified by a Base32-encoded MD5 hash and stored with +per-schema isolation:: + + _hash/{schema}/{hash} + +With optional subfolding (configured per-store):: + + _hash/{schema}/{fold1}/{fold2}/{hash} + +Subfolding creates directory hierarchies to improve performance on filesystems +that struggle with large directories (ext3, FAT32, NFS). Modern filesystems +(ext4, XFS, ZFS, S3) handle flat directories efficiently. + +**Storage Model:** + +- **Hash** is used for content identification (deduplication, integrity verification) +- **Path** is always stored in metadata and used for all file operations + +This design protects against configuration changes (e.g., subfolding) affecting +existing data. The path stored at insert time is always used for retrieval. + +Hash-addressed storage is used by ````, ````, and ```` types. +Deduplication occurs within each schema. Deletion requires garbage collection +via ``dj.gc.collect()``. + +See Also +-------- +datajoint.gc : Garbage collection for orphaned storage items. +""" + +import base64 +import hashlib +import logging +from typing import Any + +from .errors import DataJointError +from .settings import config +from .storage import StorageBackend + +logger = logging.getLogger(__name__.split(".")[0]) + + +def compute_hash(data: bytes) -> str: + """ + Compute Base32-encoded MD5 hash of content. + + Parameters + ---------- + data : bytes + Content bytes. + + Returns + ------- + str + Base32-encoded hash (26 lowercase characters, no padding). + """ + md5_digest = hashlib.md5(data).digest() + # Base32 encode, remove padding, lowercase for filesystem compatibility + return base64.b32encode(md5_digest).decode("ascii").rstrip("=").lower() + + +def _subfold(name: str, folds: tuple[int, ...]) -> tuple[str, ...]: + """ + Create subfolding hierarchy from a hash string. + + Parameters + ---------- + name : str + Hash string to subfold. + folds : tuple[int, ...] + Lengths of each subfolder level. + + Returns + ------- + tuple[str, ...] + Subfolder names. + + Examples + -------- + >>> _subfold("abcdefgh", (2, 3)) + ('ab', 'cde') + """ + if not folds: + return () + return (name[: folds[0]],) + _subfold(name[folds[0] :], folds[1:]) + + +def build_hash_path( + content_hash: str, + schema_name: str, + subfolding: tuple[int, ...] | None = None, +) -> str: + """ + Build the storage path for hash-addressed storage. 
+ + Path structure without subfolding:: + + _hash/{schema}/{hash} + + Path structure with subfolding (e.g., (2, 2)):: + + _hash/{schema}/{fold1}/{fold2}/{hash} + + Parameters + ---------- + content_hash : str + Base32-encoded hash (26 characters). + schema_name : str + Database/schema name for isolation. + subfolding : tuple[int, ...], optional + Subfolding pattern from store config. None means flat (no subfolding). + + Returns + ------- + str + Relative path within the store. + """ + # Validate hash format (26 base32 chars, lowercase alphanumeric) + if not (len(content_hash) == 26 and content_hash.isalnum() and content_hash.islower()): + raise DataJointError(f"Invalid content hash (expected 26-char lowercase base32): {content_hash}") + + if subfolding: + folds = _subfold(content_hash, subfolding) + fold_path = "/".join(folds) + return f"_hash/{schema_name}/{fold_path}/{content_hash}" + else: + return f"_hash/{schema_name}/{content_hash}" + + +def get_store_backend(store_name: str | None = None) -> StorageBackend: + """ + Get a StorageBackend for hash-addressed storage. + + Parameters + ---------- + store_name : str, optional + Name of the store to use. If None, uses the default object storage + configuration or the configured default_store. + + Returns + ------- + StorageBackend + StorageBackend instance. + """ + # If store_name is None, check for configured default_store + if store_name is None and config.object_storage.default_store: + store_name = config.object_storage.default_store + + # get_object_store_spec handles None by returning default object_storage config + spec = config.get_object_store_spec(store_name) + return StorageBackend(spec) + + +def get_store_subfolding(store_name: str | None = None) -> tuple[int, ...] | None: + """ + Get the subfolding configuration for a store. + + Parameters + ---------- + store_name : str, optional + Name of the store. If None, uses default store. + + Returns + ------- + tuple[int, ...] | None + Subfolding pattern (e.g., (2, 2)) or None for flat storage. + """ + spec = config.get_object_store_spec(store_name) + subfolding = spec.get("subfolding") + if subfolding is not None: + return tuple(subfolding) + return None + + +def put_hash( + data: bytes, + schema_name: str, + store_name: str | None = None, +) -> dict[str, Any]: + """ + Store content using hash-addressed storage. + + If the content already exists (same hash in same schema), it is not + re-uploaded. Returns metadata including the hash, path, store, and size. + + The path is always stored in metadata and used for retrieval, protecting + against configuration changes (e.g., subfolding) affecting existing data. + + Parameters + ---------- + data : bytes + Content bytes to store. + schema_name : str + Database/schema name for path isolation. + store_name : str, optional + Name of the store. If None, uses default store. + + Returns + ------- + dict[str, Any] + Metadata dict with keys: hash, path, schema, store, size. 
+ """ + content_hash = compute_hash(data) + subfolding = get_store_subfolding(store_name) + path = build_hash_path(content_hash, schema_name, subfolding) + + backend = get_store_backend(store_name) + + # Check if content already exists (deduplication within schema) + if not backend.exists(path): + backend.put_buffer(data, path) + logger.debug(f"Stored new hash: {content_hash} ({len(data)} bytes)") + else: + logger.debug(f"Hash already exists: {content_hash}") + + return { + "hash": content_hash, + "path": path, # Always stored for retrieval + "schema": schema_name, + "store": store_name, + "size": len(data), + } + + +def get_hash(metadata: dict[str, Any]) -> bytes: + """ + Retrieve content using stored metadata. + + Uses the stored path directly (not derived from hash) to protect against + configuration changes affecting existing data. + + Parameters + ---------- + metadata : dict + Metadata dict with keys: path, hash, store (optional). + + Returns + ------- + bytes + Content bytes. + + Raises + ------ + MissingExternalFile + If content is not found at the stored path. + DataJointError + If hash verification fails (data corruption). + """ + path = metadata["path"] + expected_hash = metadata["hash"] + store_name = metadata.get("store") + + backend = get_store_backend(store_name) + data = backend.get_buffer(path) + + # Verify hash for integrity + actual_hash = compute_hash(data) + if actual_hash != expected_hash: + raise DataJointError( + f"Hash mismatch: expected {expected_hash}, got {actual_hash}. " f"Data at {path} may be corrupted." + ) + + return data + + +def hash_exists( + content_hash: str, + schema_name: str, + store_name: str | None = None, +) -> bool: + """ + Check if hash-addressed content exists in storage. + + Parameters + ---------- + content_hash : str + Base32-encoded hash (26 characters). + schema_name : str + Database/schema name for path isolation. + store_name : str, optional + Name of the store. If None, uses default store. + + Returns + ------- + bool + True if content exists. + """ + subfolding = get_store_subfolding(store_name) + path = build_hash_path(content_hash, schema_name, subfolding) + backend = get_store_backend(store_name) + return backend.exists(path) + + +def delete_path( + path: str, + store_name: str | None = None, +) -> bool: + """ + Delete content at the specified path from storage. + + This should only be called after verifying no references exist. + Use garbage collection to safely remove unreferenced content. + + Parameters + ---------- + path : str + Storage path (as stored in metadata). + store_name : str, optional + Name of the store. If None, uses default store. + + Returns + ------- + bool + True if content was deleted, False if it didn't exist. + + Warnings + -------- + This permanently deletes content. Ensure no references exist first. + """ + backend = get_store_backend(store_name) + + if backend.exists(path): + backend.remove(path) + logger.debug(f"Deleted: {path}") + return True + return False + + +# Backward compatibility alias +def delete_hash( + content_hash: str, + schema_name: str, + store_name: str | None = None, +) -> bool: + """ + Delete hash-addressed content from storage (deprecated). + + .. deprecated:: + Use :func:`delete_path` with the stored path instead. + + Parameters + ---------- + content_hash : str + Base32-encoded hash (26 characters). + schema_name : str + Database/schema name for path isolation. + store_name : str, optional + Name of the store. If None, uses default store. 
+ + Returns + ------- + bool + True if content was deleted, False if it didn't exist. + """ + subfolding = get_store_subfolding(store_name) + path = build_hash_path(content_hash, schema_name, subfolding) + return delete_path(path, store_name) + + +def get_size( + path: str, + store_name: str | None = None, +) -> int: + """ + Get the size of content at the specified path. + + Parameters + ---------- + path : str + Storage path (as stored in metadata). + store_name : str, optional + Name of the store. If None, uses default store. + + Returns + ------- + int + Size in bytes. + """ + backend = get_store_backend(store_name) + return backend.size(path) + + +# Backward compatibility alias +def get_hash_size( + content_hash: str, + schema_name: str, + store_name: str | None = None, +) -> int: + """ + Get the size of hash-addressed content (deprecated). + + .. deprecated:: + Use :func:`get_size` with the stored path instead. + + Parameters + ---------- + content_hash : str + Base32-encoded hash (26 characters). + schema_name : str + Database/schema name for path isolation. + store_name : str, optional + Name of the store. If None, uses default store. + + Returns + ------- + int + Size in bytes. + """ + subfolding = get_store_subfolding(store_name) + path = build_hash_path(content_hash, schema_name, subfolding) + return get_size(path, store_name) diff --git a/tests/integration/test_content_storage.py b/tests/integration/test_content_storage.py deleted file mode 100644 index e6d0f14cc..000000000 --- a/tests/integration/test_content_storage.py +++ /dev/null @@ -1,231 +0,0 @@ -""" -Tests for content-addressed storage (content_registry.py). -""" - -import hashlib -from unittest.mock import MagicMock, patch - -import pytest - -from datajoint.content_registry import ( - build_content_path, - compute_content_hash, - content_exists, - delete_content, - get_content, - get_content_size, - put_content, -) -from datajoint.errors import DataJointError - - -class TestComputeContentHash: - """Tests for compute_content_hash function.""" - - def test_computes_sha256(self): - """Test that SHA256 hash is computed correctly.""" - data = b"Hello, World!" - result = compute_content_hash(data) - - # Verify against known SHA256 hash - expected = hashlib.sha256(data).hexdigest() - assert result == expected - assert len(result) == 64 # SHA256 produces 64 hex chars - - def test_empty_bytes(self): - """Test hashing empty bytes.""" - result = compute_content_hash(b"") - expected = hashlib.sha256(b"").hexdigest() - assert result == expected - - def test_different_content_different_hash(self): - """Test that different content produces different hashes.""" - hash1 = compute_content_hash(b"content1") - hash2 = compute_content_hash(b"content2") - assert hash1 != hash2 - - def test_same_content_same_hash(self): - """Test that same content produces same hash.""" - data = b"identical content" - hash1 = compute_content_hash(data) - hash2 = compute_content_hash(data) - assert hash1 == hash2 - - -class TestBuildContentPath: - """Tests for build_content_path function.""" - - def test_builds_hierarchical_path(self): - """Test that path is built with proper hierarchy.""" - # Example hash: abcdef... 
- test_hash = "abcdef0123456789" * 4 # 64 chars - result = build_content_path(test_hash) - - # Path should be _content/{hash[:2]}/{hash[2:4]}/{hash} - assert result == f"_content/ab/cd/{test_hash}" - - def test_rejects_invalid_hash_length(self): - """Test that invalid hash length raises error.""" - with pytest.raises(DataJointError, match="Invalid content hash length"): - build_content_path("tooshort") - - with pytest.raises(DataJointError, match="Invalid content hash length"): - build_content_path("a" * 65) # Too long - - def test_real_hash_path(self): - """Test path building with a real computed hash.""" - data = b"test content" - content_hash = compute_content_hash(data) - path = build_content_path(content_hash) - - # Verify structure - parts = path.split("/") - assert parts[0] == "_content" - assert len(parts[1]) == 2 - assert len(parts[2]) == 2 - assert len(parts[3]) == 64 - assert parts[1] == content_hash[:2] - assert parts[2] == content_hash[2:4] - assert parts[3] == content_hash - - -class TestPutContent: - """Tests for put_content function.""" - - @patch("datajoint.content_registry.get_store_backend") - def test_stores_new_content(self, mock_get_backend): - """Test storing new content.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = False - mock_get_backend.return_value = mock_backend - - data = b"new content" - result = put_content(data, store_name="test_store") - - # Verify return value - assert "hash" in result - assert result["hash"] == compute_content_hash(data) - assert result["store"] == "test_store" - assert result["size"] == len(data) - - # Verify backend was called - mock_backend.put_buffer.assert_called_once() - - @patch("datajoint.content_registry.get_store_backend") - def test_deduplicates_existing_content(self, mock_get_backend): - """Test that existing content is not re-uploaded.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = True # Content already exists - mock_get_backend.return_value = mock_backend - - data = b"existing content" - result = put_content(data, store_name="test_store") - - # Verify return value is still correct - assert result["hash"] == compute_content_hash(data) - assert result["size"] == len(data) - - # Verify put_buffer was NOT called (deduplication) - mock_backend.put_buffer.assert_not_called() - - -class TestGetContent: - """Tests for get_content function.""" - - @patch("datajoint.content_registry.get_store_backend") - def test_retrieves_content(self, mock_get_backend): - """Test retrieving content by hash.""" - data = b"stored content" - content_hash = compute_content_hash(data) - - mock_backend = MagicMock() - mock_backend.get_buffer.return_value = data - mock_get_backend.return_value = mock_backend - - result = get_content(content_hash, store_name="test_store") - - assert result == data - - @patch("datajoint.content_registry.get_store_backend") - def test_verifies_hash(self, mock_get_backend): - """Test that hash is verified on retrieval.""" - data = b"original content" - content_hash = compute_content_hash(data) - - # Return corrupted data - mock_backend = MagicMock() - mock_backend.get_buffer.return_value = b"corrupted content" - mock_get_backend.return_value = mock_backend - - with pytest.raises(DataJointError, match="Content hash mismatch"): - get_content(content_hash, store_name="test_store") - - -class TestContentExists: - """Tests for content_exists function.""" - - @patch("datajoint.content_registry.get_store_backend") - def test_returns_true_when_exists(self, mock_get_backend): - """Test 
that True is returned when content exists.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = True - mock_get_backend.return_value = mock_backend - - content_hash = "a" * 64 - assert content_exists(content_hash, store_name="test_store") is True - - @patch("datajoint.content_registry.get_store_backend") - def test_returns_false_when_not_exists(self, mock_get_backend): - """Test that False is returned when content doesn't exist.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = False - mock_get_backend.return_value = mock_backend - - content_hash = "a" * 64 - assert content_exists(content_hash, store_name="test_store") is False - - -class TestDeleteContent: - """Tests for delete_content function.""" - - @patch("datajoint.content_registry.get_store_backend") - def test_deletes_existing_content(self, mock_get_backend): - """Test deleting existing content.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = True - mock_get_backend.return_value = mock_backend - - content_hash = "a" * 64 - result = delete_content(content_hash, store_name="test_store") - - assert result is True - mock_backend.remove.assert_called_once() - - @patch("datajoint.content_registry.get_store_backend") - def test_returns_false_for_nonexistent(self, mock_get_backend): - """Test that False is returned when content doesn't exist.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = False - mock_get_backend.return_value = mock_backend - - content_hash = "a" * 64 - result = delete_content(content_hash, store_name="test_store") - - assert result is False - mock_backend.remove.assert_not_called() - - -class TestGetContentSize: - """Tests for get_content_size function.""" - - @patch("datajoint.content_registry.get_store_backend") - def test_returns_size(self, mock_get_backend): - """Test getting content size.""" - mock_backend = MagicMock() - mock_backend.size.return_value = 1024 - mock_get_backend.return_value = mock_backend - - content_hash = "a" * 64 - result = get_content_size(content_hash, store_name="test_store") - - assert result == 1024 diff --git a/tests/integration/test_gc.py b/tests/integration/test_gc.py index e0c5fafca..7eca79f37 100644 --- a/tests/integration/test_gc.py +++ b/tests/integration/test_gc.py @@ -10,15 +10,15 @@ from datajoint.errors import DataJointError -class TestUsesContentStorage: - """Tests for _uses_content_storage helper function.""" +class TestUsesHashStorage: + """Tests for _uses_hash_storage helper function.""" def test_returns_false_for_no_adapter(self): """Test that False is returned when attribute has no codec.""" attr = MagicMock() attr.codec = None - assert gc._uses_content_storage(attr) is False + assert gc._uses_hash_storage(attr) is False def test_returns_true_for_hash_type(self): """Test that True is returned for type.""" @@ -27,7 +27,7 @@ def test_returns_true_for_hash_type(self): attr.codec.name = "hash" attr.store = "mystore" - assert gc._uses_content_storage(attr) is True + assert gc._uses_hash_storage(attr) is True def test_returns_true_for_blob_external(self): """Test that True is returned for type (external).""" @@ -36,7 +36,7 @@ def test_returns_true_for_blob_external(self): attr.codec.name = "blob" attr.store = "mystore" - assert gc._uses_content_storage(attr) is True + assert gc._uses_hash_storage(attr) is True def test_returns_true_for_attach_external(self): """Test that True is returned for type (external).""" @@ -45,7 +45,7 @@ def test_returns_true_for_attach_external(self): attr.codec.name = "attach" 
attr.store = "mystore" - assert gc._uses_content_storage(attr) is True + assert gc._uses_hash_storage(attr) is True def test_returns_false_for_blob_internal(self): """Test that False is returned for internal storage.""" @@ -54,94 +54,102 @@ def test_returns_false_for_blob_internal(self): attr.codec.name = "blob" attr.store = None - assert gc._uses_content_storage(attr) is False + assert gc._uses_hash_storage(attr) is False -class TestExtractContentRefs: - """Tests for _extract_content_refs helper function.""" +class TestExtractHashRefs: + """Tests for _extract_hash_refs helper function.""" def test_returns_empty_for_none(self): """Test that empty list is returned for None value.""" - assert gc._extract_content_refs(None) == [] + assert gc._extract_hash_refs(None) == [] def test_parses_json_string(self): - """Test parsing JSON string with hash.""" - value = '{"hash": "abc123", "store": "mystore"}' - refs = gc._extract_content_refs(value) + """Test parsing JSON string with path.""" + value = '{"path": "_hash/schema/abc123", "hash": "abc123", "store": "mystore"}' + refs = gc._extract_hash_refs(value) assert len(refs) == 1 - assert refs[0] == ("abc123", "mystore") + assert refs[0] == ("_hash/schema/abc123", "mystore") def test_parses_dict_directly(self): - """Test parsing dict with hash.""" - value = {"hash": "def456", "store": None} - refs = gc._extract_content_refs(value) + """Test parsing dict with path.""" + value = {"path": "_hash/schema/def456", "hash": "def456", "store": None} + refs = gc._extract_hash_refs(value) assert len(refs) == 1 - assert refs[0] == ("def456", None) + assert refs[0] == ("_hash/schema/def456", None) def test_returns_empty_for_invalid_json(self): """Test that empty list is returned for invalid JSON.""" - assert gc._extract_content_refs("not json") == [] + assert gc._extract_hash_refs("not json") == [] - def test_returns_empty_for_dict_without_hash(self): - """Test that empty list is returned for dict without hash key.""" - assert gc._extract_content_refs({"other": "data"}) == [] + def test_returns_empty_for_dict_without_path(self): + """Test that empty list is returned for dict without path key.""" + assert gc._extract_hash_refs({"hash": "abc123"}) == [] -class TestUsesObjectStorage: - """Tests for _uses_object_storage helper function.""" +class TestUsesSchemaStorage: + """Tests for _uses_schema_storage helper function.""" def test_returns_false_for_no_adapter(self): """Test that False is returned when attribute has no codec.""" attr = MagicMock() attr.codec = None - assert gc._uses_object_storage(attr) is False + assert gc._uses_schema_storage(attr) is False def test_returns_true_for_object_type(self): - """Test that True is returned for type.""" + """Test that True is returned for type.""" attr = MagicMock() attr.codec = MagicMock() attr.codec.name = "object" - assert gc._uses_object_storage(attr) is True + assert gc._uses_schema_storage(attr) is True + + def test_returns_true_for_npy_type(self): + """Test that True is returned for type.""" + attr = MagicMock() + attr.codec = MagicMock() + attr.codec.name = "npy" + + assert gc._uses_schema_storage(attr) is True def test_returns_false_for_other_types(self): - """Test that False is returned for non-object types.""" + """Test that False is returned for non-schema-addressed types.""" attr = MagicMock() attr.codec = MagicMock() attr.codec.name = "blob" - assert gc._uses_object_storage(attr) is False + assert gc._uses_schema_storage(attr) is False -class TestExtractObjectRefs: - """Tests for _extract_object_refs helper 
function.""" +class TestExtractSchemaRefs: + """Tests for _extract_schema_refs helper function.""" def test_returns_empty_for_none(self): """Test that empty list is returned for None value.""" - assert gc._extract_object_refs(None) == [] + assert gc._extract_schema_refs(None) == [] def test_parses_json_string(self): """Test parsing JSON string with path.""" - value = '{"path": "schema/table/objects/pk/field_abc123", "store": "mystore"}' - refs = gc._extract_object_refs(value) + value = '{"path": "schema/table/pk/field", "store": "mystore"}' + refs = gc._extract_schema_refs(value) assert len(refs) == 1 - assert refs[0] == ("schema/table/objects/pk/field_abc123", "mystore") + assert refs[0] == ("schema/table/pk/field", "mystore") def test_parses_dict_directly(self): """Test parsing dict with path.""" value = {"path": "test/path", "store": None} - refs = gc._extract_object_refs(value) + refs = gc._extract_schema_refs(value) assert len(refs) == 1 assert refs[0] == ("test/path", None) def test_returns_empty_for_dict_without_path(self): """Test that empty list is returned for dict without path key.""" - assert gc._extract_object_refs({"other": "data"}) == [] + assert gc._extract_schema_refs({"other": "data"}) == [] class TestScan: @@ -152,46 +160,46 @@ def test_requires_at_least_one_schema(self): with pytest.raises(DataJointError, match="At least one schema must be provided"): gc.scan() - @patch("datajoint.gc.scan_object_references") - @patch("datajoint.gc.list_stored_objects") - @patch("datajoint.gc.scan_references") - @patch("datajoint.gc.list_stored_content") - def test_returns_stats(self, mock_list_content, mock_scan_refs, mock_list_objects, mock_scan_objects): + @patch("datajoint.gc.scan_schema_references") + @patch("datajoint.gc.list_schema_paths") + @patch("datajoint.gc.scan_hash_references") + @patch("datajoint.gc.list_stored_hashes") + def test_returns_stats(self, mock_list_hashes, mock_scan_hash, mock_list_schemas, mock_scan_schema): """Test that scan returns proper statistics.""" - # Mock content-addressed storage - mock_scan_refs.return_value = {"hash1", "hash2"} - mock_list_content.return_value = { - "hash1": 100, - "hash3": 200, # orphaned + # Mock hash-addressed storage (now uses paths) + mock_scan_hash.return_value = {"_hash/schema/path1", "_hash/schema/path2"} + mock_list_hashes.return_value = { + "_hash/schema/path1": 100, + "_hash/schema/path3": 200, # orphaned } - # Mock path-addressed storage - mock_scan_objects.return_value = {"path/to/obj1"} - mock_list_objects.return_value = { - "path/to/obj1": 500, - "path/to/obj2": 300, # orphaned + # Mock schema-addressed storage + mock_scan_schema.return_value = {"schema/table/pk1/field"} + mock_list_schemas.return_value = { + "schema/table/pk1/field": 500, + "schema/table/pk2/field": 300, # orphaned } mock_schema = MagicMock() stats = gc.scan(mock_schema, store_name="test_store") - # Content stats - assert stats["content_referenced"] == 2 - assert stats["content_stored"] == 2 - assert stats["content_orphaned"] == 1 - assert "hash3" in stats["orphaned_hashes"] + # Hash stats + assert stats["hash_referenced"] == 2 + assert stats["hash_stored"] == 2 + assert stats["hash_orphaned"] == 1 + assert "_hash/schema/path3" in stats["orphaned_hashes"] - # Object stats - assert stats["object_referenced"] == 1 - assert stats["object_stored"] == 2 - assert stats["object_orphaned"] == 1 - assert "path/to/obj2" in stats["orphaned_paths"] + # Schema stats + assert stats["schema_paths_referenced"] == 1 + assert stats["schema_paths_stored"] == 2 + 
assert stats["schema_paths_orphaned"] == 1 + assert "schema/table/pk2/field" in stats["orphaned_paths"] # Combined totals assert stats["referenced"] == 3 assert stats["stored"] == 4 assert stats["orphaned"] == 2 - assert stats["orphaned_bytes"] == 500 # 200 content + 300 object + assert stats["orphaned_bytes"] == 500 # 200 hash + 300 schema class TestCollect: @@ -205,10 +213,10 @@ def test_dry_run_does_not_delete(self, mock_scan): "stored": 2, "orphaned": 1, "orphaned_bytes": 100, - "orphaned_hashes": ["orphan_hash"], + "orphaned_hashes": ["_hash/schema/orphan_path"], "orphaned_paths": [], - "content_orphaned": 1, - "object_orphaned": 0, + "hash_orphaned": 1, + "schema_paths_orphaned": 0, } mock_schema = MagicMock() @@ -218,59 +226,59 @@ def test_dry_run_does_not_delete(self, mock_scan): assert stats["bytes_freed"] == 0 assert stats["dry_run"] is True - @patch("datajoint.gc.delete_content") - @patch("datajoint.gc.list_stored_content") + @patch("datajoint.gc.delete_path") + @patch("datajoint.gc.list_stored_hashes") @patch("datajoint.gc.scan") - def test_deletes_orphaned_content(self, mock_scan, mock_list_stored, mock_delete): - """Test that orphaned content is deleted when dry_run=False.""" + def test_deletes_orphaned_hashes(self, mock_scan, mock_list_stored, mock_delete): + """Test that orphaned hashes are deleted when dry_run=False.""" mock_scan.return_value = { "referenced": 1, "stored": 2, "orphaned": 1, "orphaned_bytes": 100, - "orphaned_hashes": ["orphan_hash"], + "orphaned_hashes": ["_hash/schema/orphan_path"], "orphaned_paths": [], - "content_orphaned": 1, - "object_orphaned": 0, + "hash_orphaned": 1, + "schema_paths_orphaned": 0, } - mock_list_stored.return_value = {"orphan_hash": 100} + mock_list_stored.return_value = {"_hash/schema/orphan_path": 100} mock_delete.return_value = True mock_schema = MagicMock() stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) assert stats["deleted"] == 1 - assert stats["content_deleted"] == 1 + assert stats["hash_deleted"] == 1 assert stats["bytes_freed"] == 100 assert stats["dry_run"] is False - mock_delete.assert_called_once_with("orphan_hash", "test_store") + mock_delete.assert_called_once_with("_hash/schema/orphan_path", "test_store") - @patch("datajoint.gc.delete_object") - @patch("datajoint.gc.list_stored_objects") + @patch("datajoint.gc.delete_schema_path") + @patch("datajoint.gc.list_schema_paths") @patch("datajoint.gc.scan") - def test_deletes_orphaned_objects(self, mock_scan, mock_list_objects, mock_delete): - """Test that orphaned objects are deleted when dry_run=False.""" + def test_deletes_orphaned_schemas(self, mock_scan, mock_list_schemas, mock_delete): + """Test that orphaned schema paths are deleted when dry_run=False.""" mock_scan.return_value = { "referenced": 1, "stored": 2, "orphaned": 1, "orphaned_bytes": 500, "orphaned_hashes": [], - "orphaned_paths": ["path/to/orphan"], - "content_orphaned": 0, - "object_orphaned": 1, + "orphaned_paths": ["schema/table/pk/field"], + "hash_orphaned": 0, + "schema_paths_orphaned": 1, } - mock_list_objects.return_value = {"path/to/orphan": 500} + mock_list_schemas.return_value = {"schema/table/pk/field": 500} mock_delete.return_value = True mock_schema = MagicMock() stats = gc.collect(mock_schema, store_name="test_store", dry_run=False) assert stats["deleted"] == 1 - assert stats["object_deleted"] == 1 + assert stats["schema_paths_deleted"] == 1 assert stats["bytes_freed"] == 500 assert stats["dry_run"] is False - mock_delete.assert_called_once_with("path/to/orphan", 
"test_store") + mock_delete.assert_called_once_with("schema/table/pk/field", "test_store") class TestFormatStats: @@ -283,14 +291,14 @@ def test_formats_scan_stats(self): "stored": 15, "orphaned": 5, "orphaned_bytes": 1024 * 1024, # 1 MB - "content_referenced": 6, - "content_stored": 8, - "content_orphaned": 2, - "content_orphaned_bytes": 512 * 1024, - "object_referenced": 4, - "object_stored": 7, - "object_orphaned": 3, - "object_orphaned_bytes": 512 * 1024, + "hash_referenced": 6, + "hash_stored": 8, + "hash_orphaned": 2, + "hash_orphaned_bytes": 512 * 1024, + "schema_paths_referenced": 4, + "schema_paths_stored": 7, + "schema_paths_orphaned": 3, + "schema_paths_orphaned_bytes": 512 * 1024, } result = gc.format_stats(stats) @@ -300,8 +308,8 @@ def test_formats_scan_stats(self): assert "Orphaned (unreferenced): 5" in result assert "1.00 MB" in result # Check for detailed sections - assert "Content-Addressed Storage" in result - assert "Path-Addressed Storage" in result + assert "Hash-Addressed Storage" in result + assert "Schema-Addressed Storage" in result def test_formats_collect_stats_dry_run(self): """Test formatting collect statistics with dry_run.""" @@ -325,8 +333,8 @@ def test_formats_collect_stats_actual(self): "stored": 15, "orphaned": 5, "deleted": 3, - "content_deleted": 2, - "object_deleted": 1, + "hash_deleted": 2, + "schema_paths_deleted": 1, "bytes_freed": 2 * 1024 * 1024, # 2 MB "errors": 2, "dry_run": False, @@ -335,7 +343,7 @@ def test_formats_collect_stats_actual(self): result = gc.format_stats(stats) assert "Deleted: 3" in result - assert "Content: 2" in result - assert "Objects: 1" in result + assert "Hash items: 2" in result + assert "Schema paths: 1" in result assert "2.00 MB" in result assert "Errors: 2" in result diff --git a/tests/integration/test_hash_storage.py b/tests/integration/test_hash_storage.py new file mode 100644 index 000000000..8cd8b5b93 --- /dev/null +++ b/tests/integration/test_hash_storage.py @@ -0,0 +1,304 @@ +""" +Tests for hash-addressed storage (hash_registry.py). +""" + +import re +from unittest.mock import MagicMock, patch + +import pytest + +from datajoint.hash_registry import ( + build_hash_path, + compute_hash, + hash_exists, + delete_path, + delete_hash, + get_hash, + get_size, + get_hash_size, + put_hash, +) +from datajoint.errors import DataJointError + + +# Base32 pattern for validation (26 lowercase alphanumeric chars) +BASE32_PATTERN = re.compile(r"^[a-z2-7]{26}$") + + +class TestComputeHash: + """Tests for compute_hash function.""" + + def test_returns_base32_format(self): + """Test that hash is returned as Base32 string.""" + data = b"Hello, World!" 
+ result = compute_hash(data) + + # Should be valid Base32 format (26 lowercase chars) + assert len(result) == 26 + assert BASE32_PATTERN.match(result) + + def test_empty_bytes(self): + """Test hashing empty bytes.""" + result = compute_hash(b"") + assert BASE32_PATTERN.match(result) + + def test_different_content_different_hash(self): + """Test that different content produces different hashes.""" + hash1 = compute_hash(b"content1") + hash2 = compute_hash(b"content2") + assert hash1 != hash2 + + def test_same_content_same_hash(self): + """Test that same content produces same hash.""" + data = b"identical content" + hash1 = compute_hash(data) + hash2 = compute_hash(data) + assert hash1 == hash2 + + +class TestBuildHashPath: + """Tests for build_hash_path function.""" + + def test_builds_flat_path(self): + """Test that path is built as _hash/{schema}/{hash}.""" + test_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # 26 char base32 + result = build_hash_path(test_hash, "my_schema") + + assert result == f"_hash/my_schema/{test_hash}" + + def test_builds_subfolded_path(self): + """Test path with subfolding.""" + test_hash = "abcdefghijklmnopqrstuvwxyz"[:26] + result = build_hash_path(test_hash, "my_schema", subfolding=(2, 2)) + + assert result == f"_hash/my_schema/ab/cd/{test_hash}" + + def test_rejects_invalid_hash(self): + """Test that invalid hash raises error.""" + with pytest.raises(DataJointError, match="Invalid content hash"): + build_hash_path("not-a-hash", "my_schema") + + with pytest.raises(DataJointError, match="Invalid content hash"): + build_hash_path("a" * 64, "my_schema") # Too long + + with pytest.raises(DataJointError, match="Invalid content hash"): + build_hash_path("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[:26], "my_schema") # Uppercase + + def test_real_hash_path(self): + """Test path building with a real computed hash.""" + data = b"test content" + content_hash = compute_hash(data) + path = build_hash_path(content_hash, "test_schema") + + # Verify structure: _hash/{schema}/{hash} + parts = path.split("/") + assert len(parts) == 3 + assert parts[0] == "_hash" + assert parts[1] == "test_schema" + assert parts[2] == content_hash + assert BASE32_PATTERN.match(parts[2]) + + +class TestPutHash: + """Tests for put_hash function.""" + + @patch("datajoint.hash_registry.get_store_subfolding") + @patch("datajoint.hash_registry.get_store_backend") + def test_stores_new_content(self, mock_get_backend, mock_get_subfolding): + """Test storing new content.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + mock_get_subfolding.return_value = None + + data = b"new content" + result = put_hash(data, schema_name="test_schema", store_name="test_store") + + # Verify return value includes hash and path + assert "hash" in result + assert "path" in result + assert result["hash"] == compute_hash(data) + assert result["path"] == f"_hash/test_schema/{result['hash']}" + assert result["schema"] == "test_schema" + assert result["store"] == "test_store" + assert result["size"] == len(data) + + # Verify backend was called + mock_backend.put_buffer.assert_called_once() + + @patch("datajoint.hash_registry.get_store_subfolding") + @patch("datajoint.hash_registry.get_store_backend") + def test_deduplicates_existing_content(self, mock_get_backend, mock_get_subfolding): + """Test that existing content is not re-uploaded.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True # Content already exists + mock_get_backend.return_value = 
mock_backend + mock_get_subfolding.return_value = None + + data = b"existing content" + result = put_hash(data, schema_name="test_schema", store_name="test_store") + + # Verify return value is still correct + assert result["hash"] == compute_hash(data) + assert "path" in result + assert result["schema"] == "test_schema" + assert result["size"] == len(data) + + # Verify put_buffer was NOT called (deduplication) + mock_backend.put_buffer.assert_not_called() + + +class TestGetHash: + """Tests for get_hash function.""" + + @patch("datajoint.hash_registry.get_store_backend") + def test_retrieves_content(self, mock_get_backend): + """Test retrieving content using metadata.""" + data = b"stored content" + content_hash = compute_hash(data) + + mock_backend = MagicMock() + mock_backend.get_buffer.return_value = data + mock_get_backend.return_value = mock_backend + + metadata = { + "hash": content_hash, + "path": f"_hash/test_schema/{content_hash}", + "store": "test_store", + } + result = get_hash(metadata) + + assert result == data + mock_backend.get_buffer.assert_called_once_with(metadata["path"]) + + @patch("datajoint.hash_registry.get_store_backend") + def test_verifies_hash(self, mock_get_backend): + """Test that hash is verified on retrieval.""" + data = b"original content" + content_hash = compute_hash(data) + + # Return corrupted data + mock_backend = MagicMock() + mock_backend.get_buffer.return_value = b"corrupted content" + mock_get_backend.return_value = mock_backend + + metadata = { + "hash": content_hash, + "path": f"_hash/test_schema/{content_hash}", + "store": "test_store", + } + + with pytest.raises(DataJointError, match="Hash mismatch"): + get_hash(metadata) + + +class TestHashExists: + """Tests for hash_exists function.""" + + @patch("datajoint.hash_registry.get_store_subfolding") + @patch("datajoint.hash_registry.get_store_backend") + def test_returns_true_when_exists(self, mock_get_backend, mock_get_subfolding): + """Test that True is returned when content exists.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True + mock_get_backend.return_value = mock_backend + mock_get_subfolding.return_value = None + + content_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # Valid base32 + assert hash_exists(content_hash, schema_name="test_schema", store_name="test_store") is True + + @patch("datajoint.hash_registry.get_store_subfolding") + @patch("datajoint.hash_registry.get_store_backend") + def test_returns_false_when_not_exists(self, mock_get_backend, mock_get_subfolding): + """Test that False is returned when content doesn't exist.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + mock_get_subfolding.return_value = None + + content_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # Valid base32 + assert hash_exists(content_hash, schema_name="test_schema", store_name="test_store") is False + + +class TestDeletePath: + """Tests for delete_path function.""" + + @patch("datajoint.hash_registry.get_store_backend") + def test_deletes_existing_content(self, mock_get_backend): + """Test deleting existing content by path.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True + mock_get_backend.return_value = mock_backend + + path = "_hash/test_schema/abcdefghijklmnopqrst" + result = delete_path(path, store_name="test_store") + + assert result is True + mock_backend.remove.assert_called_once_with(path) + + @patch("datajoint.hash_registry.get_store_backend") + def 
test_returns_false_for_nonexistent(self, mock_get_backend): + """Test that False is returned when content doesn't exist.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = False + mock_get_backend.return_value = mock_backend + + path = "_hash/test_schema/abcdefghijklmnopqrst" + result = delete_path(path, store_name="test_store") + + assert result is False + mock_backend.remove.assert_not_called() + + +class TestDeleteHash: + """Tests for delete_hash function (backward compatibility).""" + + @patch("datajoint.hash_registry.get_store_subfolding") + @patch("datajoint.hash_registry.get_store_backend") + def test_deletes_existing_content(self, mock_get_backend, mock_get_subfolding): + """Test deleting existing content by hash.""" + mock_backend = MagicMock() + mock_backend.exists.return_value = True + mock_get_backend.return_value = mock_backend + mock_get_subfolding.return_value = None + + content_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # Valid base32 + result = delete_hash(content_hash, schema_name="test_schema", store_name="test_store") + + assert result is True + mock_backend.remove.assert_called_once() + + +class TestGetSize: + """Tests for get_size function.""" + + @patch("datajoint.hash_registry.get_store_backend") + def test_returns_size(self, mock_get_backend): + """Test getting content size by path.""" + mock_backend = MagicMock() + mock_backend.size.return_value = 1024 + mock_get_backend.return_value = mock_backend + + path = "_hash/test_schema/abcdefghijklmnopqrst" + result = get_size(path, store_name="test_store") + + assert result == 1024 + mock_backend.size.assert_called_once_with(path) + + +class TestGetHashSize: + """Tests for get_hash_size function (backward compatibility).""" + + @patch("datajoint.hash_registry.get_store_subfolding") + @patch("datajoint.hash_registry.get_store_backend") + def test_returns_size(self, mock_get_backend, mock_get_subfolding): + """Test getting content size by hash.""" + mock_backend = MagicMock() + mock_backend.size.return_value = 1024 + mock_get_backend.return_value = mock_backend + mock_get_subfolding.return_value = None + + content_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # Valid base32 + result = get_hash_size(content_hash, schema_name="test_schema", store_name="test_store") + + assert result == 1024 From d2ab4de23b4f8febfd72ee07aa3084f02b36b7ef Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 13 Jan 2026 14:05:24 -0600 Subject: [PATCH 09/10] refactor: Remove uuid_from_buffer, use hashlib directly for query cache - Remove uuid_from_buffer from hash.py (dead code) - connection.py now uses hashlib.md5().hexdigest() directly - Update test_hash.py to test key_hash instead Co-Authored-By: Claude Opus 4.5 --- src/datajoint/connection.py | 4 ++-- src/datajoint/hash.py | 14 -------------- tests/unit/test_hash.py | 8 +++++--- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/src/datajoint/connection.py b/src/datajoint/connection.py index 57301c2f3..219e97d98 100644 --- a/src/datajoint/connection.py +++ b/src/datajoint/connection.py @@ -5,6 +5,7 @@ from __future__ import annotations +import hashlib import logging import pathlib import re @@ -18,7 +19,6 @@ from . 
import errors from .blob import pack, unpack from .dependencies import Dependencies -from .hash import uuid_from_buffer from .settings import config from .version import __version__ @@ -418,7 +418,7 @@ def query( if use_query_cache: if not config[cache_key]: raise errors.DataJointError(f"Provide filepath dj.config['{cache_key}'] when using query caching.") - hash_ = uuid_from_buffer((str(self._query_cache) + re.sub(r"`\$\w+`", "", query)).encode() + pack(args)) + hash_ = hashlib.md5((str(self._query_cache) + re.sub(r"`\$\w+`", "", query)).encode() + pack(args)).hexdigest() cache_path = pathlib.Path(config[cache_key]) / str(hash_) try: buffer = cache_path.read_bytes() diff --git a/src/datajoint/hash.py b/src/datajoint/hash.py index 2a58e9bf4..58f87b88e 100644 --- a/src/datajoint/hash.py +++ b/src/datajoint/hash.py @@ -1,7 +1,6 @@ from __future__ import annotations import hashlib -import uuid from typing import Any @@ -14,16 +13,3 @@ def key_hash(mapping: dict[str, Any]) -> str: for k, v in sorted(mapping.items()): hashed.update(str(v).encode()) return hashed.hexdigest() - - -def uuid_from_buffer(buffer: bytes = b"", *, init_string: str = "") -> uuid.UUID: - """ - Compute MD5 hash of buffer data, returned as UUID. - - :param buffer: bytes to hash - :param init_string: string to initialize the checksum (for namespacing) - :return: UUID based on MD5 digest - """ - hashed = hashlib.md5(init_string.encode()) - hashed.update(buffer) - return uuid.UUID(bytes=hashed.digest()) diff --git a/tests/unit/test_hash.py b/tests/unit/test_hash.py index a88c45316..125ab4dbe 100644 --- a/tests/unit/test_hash.py +++ b/tests/unit/test_hash.py @@ -1,6 +1,8 @@ from datajoint import hash -def test_hash(): - assert hash.uuid_from_buffer(b"abc").hex == "900150983cd24fb0d6963f7d28e17f72" - assert hash.uuid_from_buffer(b"").hex == "d41d8cd98f00b204e9800998ecf8427e" +def test_key_hash(): + """Test that key_hash produces consistent MD5 hex digests.""" + assert hash.key_hash({"a": 1, "b": 2}) == hash.key_hash({"b": 2, "a": 1}) + assert hash.key_hash({"x": "hello"}) == "5d41402abc4b2a76b9719d911017c592" + assert hash.key_hash({}) == "d41d8cd98f00b204e9800998ecf8427e" From 58f2b67a2b8271e8a86076461c0c4c100ddce832 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 13 Jan 2026 14:10:47 -0600 Subject: [PATCH 10/10] refactor: Remove unused functions from hash_registry Remove dead code that was only tested but never used in production: - hash_exists (gc uses set operations on paths) - delete_hash (gc uses delete_path directly) - get_size (gc collects sizes during walk) - get_hash_size (wrapper for get_size) Remaining API: compute_hash, build_hash_path, get_store_backend, get_store_subfolding, put_hash, get_hash, delete_path Co-Authored-By: Claude Opus 4.5 --- src/datajoint/hash_registry.py | 113 ------------------------- tests/integration/test_hash_storage.py | 86 ------------------- 2 files changed, 199 deletions(-) diff --git a/src/datajoint/hash_registry.py b/src/datajoint/hash_registry.py index 7b286e874..5033f13e5 100644 --- a/src/datajoint/hash_registry.py +++ b/src/datajoint/hash_registry.py @@ -267,34 +267,6 @@ def get_hash(metadata: dict[str, Any]) -> bytes: return data -def hash_exists( - content_hash: str, - schema_name: str, - store_name: str | None = None, -) -> bool: - """ - Check if hash-addressed content exists in storage. - - Parameters - ---------- - content_hash : str - Base32-encoded hash (26 characters). - schema_name : str - Database/schema name for path isolation. 
- store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - bool - True if content exists. - """ - subfolding = get_store_subfolding(store_name) - path = build_hash_path(content_hash, schema_name, subfolding) - backend = get_store_backend(store_name) - return backend.exists(path) - - def delete_path( path: str, store_name: str | None = None, @@ -328,88 +300,3 @@ def delete_path( logger.debug(f"Deleted: {path}") return True return False - - -# Backward compatibility alias -def delete_hash( - content_hash: str, - schema_name: str, - store_name: str | None = None, -) -> bool: - """ - Delete hash-addressed content from storage (deprecated). - - .. deprecated:: - Use :func:`delete_path` with the stored path instead. - - Parameters - ---------- - content_hash : str - Base32-encoded hash (26 characters). - schema_name : str - Database/schema name for path isolation. - store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - bool - True if content was deleted, False if it didn't exist. - """ - subfolding = get_store_subfolding(store_name) - path = build_hash_path(content_hash, schema_name, subfolding) - return delete_path(path, store_name) - - -def get_size( - path: str, - store_name: str | None = None, -) -> int: - """ - Get the size of content at the specified path. - - Parameters - ---------- - path : str - Storage path (as stored in metadata). - store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - int - Size in bytes. - """ - backend = get_store_backend(store_name) - return backend.size(path) - - -# Backward compatibility alias -def get_hash_size( - content_hash: str, - schema_name: str, - store_name: str | None = None, -) -> int: - """ - Get the size of hash-addressed content (deprecated). - - .. deprecated:: - Use :func:`get_size` with the stored path instead. - - Parameters - ---------- - content_hash : str - Base32-encoded hash (26 characters). - schema_name : str - Database/schema name for path isolation. - store_name : str, optional - Name of the store. If None, uses default store. - - Returns - ------- - int - Size in bytes. 
- """ - subfolding = get_store_subfolding(store_name) - path = build_hash_path(content_hash, schema_name, subfolding) - return get_size(path, store_name) diff --git a/tests/integration/test_hash_storage.py b/tests/integration/test_hash_storage.py index 8cd8b5b93..bc1c61a4d 100644 --- a/tests/integration/test_hash_storage.py +++ b/tests/integration/test_hash_storage.py @@ -10,12 +10,8 @@ from datajoint.hash_registry import ( build_hash_path, compute_hash, - hash_exists, delete_path, - delete_hash, get_hash, - get_size, - get_hash_size, put_hash, ) from datajoint.errors import DataJointError @@ -192,34 +188,6 @@ def test_verifies_hash(self, mock_get_backend): get_hash(metadata) -class TestHashExists: - """Tests for hash_exists function.""" - - @patch("datajoint.hash_registry.get_store_subfolding") - @patch("datajoint.hash_registry.get_store_backend") - def test_returns_true_when_exists(self, mock_get_backend, mock_get_subfolding): - """Test that True is returned when content exists.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = True - mock_get_backend.return_value = mock_backend - mock_get_subfolding.return_value = None - - content_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # Valid base32 - assert hash_exists(content_hash, schema_name="test_schema", store_name="test_store") is True - - @patch("datajoint.hash_registry.get_store_subfolding") - @patch("datajoint.hash_registry.get_store_backend") - def test_returns_false_when_not_exists(self, mock_get_backend, mock_get_subfolding): - """Test that False is returned when content doesn't exist.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = False - mock_get_backend.return_value = mock_backend - mock_get_subfolding.return_value = None - - content_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # Valid base32 - assert hash_exists(content_hash, schema_name="test_schema", store_name="test_store") is False - - class TestDeletePath: """Tests for delete_path function.""" @@ -248,57 +216,3 @@ def test_returns_false_for_nonexistent(self, mock_get_backend): assert result is False mock_backend.remove.assert_not_called() - - -class TestDeleteHash: - """Tests for delete_hash function (backward compatibility).""" - - @patch("datajoint.hash_registry.get_store_subfolding") - @patch("datajoint.hash_registry.get_store_backend") - def test_deletes_existing_content(self, mock_get_backend, mock_get_subfolding): - """Test deleting existing content by hash.""" - mock_backend = MagicMock() - mock_backend.exists.return_value = True - mock_get_backend.return_value = mock_backend - mock_get_subfolding.return_value = None - - content_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # Valid base32 - result = delete_hash(content_hash, schema_name="test_schema", store_name="test_store") - - assert result is True - mock_backend.remove.assert_called_once() - - -class TestGetSize: - """Tests for get_size function.""" - - @patch("datajoint.hash_registry.get_store_backend") - def test_returns_size(self, mock_get_backend): - """Test getting content size by path.""" - mock_backend = MagicMock() - mock_backend.size.return_value = 1024 - mock_get_backend.return_value = mock_backend - - path = "_hash/test_schema/abcdefghijklmnopqrst" - result = get_size(path, store_name="test_store") - - assert result == 1024 - mock_backend.size.assert_called_once_with(path) - - -class TestGetHashSize: - """Tests for get_hash_size function (backward compatibility).""" - - @patch("datajoint.hash_registry.get_store_subfolding") - 
@patch("datajoint.hash_registry.get_store_backend") - def test_returns_size(self, mock_get_backend, mock_get_subfolding): - """Test getting content size by hash.""" - mock_backend = MagicMock() - mock_backend.size.return_value = 1024 - mock_get_backend.return_value = mock_backend - mock_get_subfolding.return_value = None - - content_hash = "abcdefghijklmnopqrstuvwxyz"[:26] # Valid base32 - result = get_hash_size(content_hash, schema_name="test_schema", store_name="test_store") - - assert result == 1024