Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions docs/inspecting_datasets.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
# Inspecting Dataset Schemas

The Registry client and Admin API on standard client provide convenient methods to explore dataset structures without manually parsing manifests.

## Quick Start

```python
from amp.registry import RegistryClient

client = RegistryClient() # Note: Inspection functionality is also available on the Admin api of the regular client (Client())

# Pretty-print dataset structure
client.datasets.inspect('edgeandnode', 'ethereum-mainnet')

# Get structured schema data
schema = client.datasets.describe('edgeandnode', 'ethereum-mainnet')
```

## Methods

### `inspect(namespace, name, version='latest')`

Pretty-prints the dataset structure in a human-readable format. Perfect for interactive exploration.

**Example Output:**
```
Dataset: edgeandnode/ethereum-mainnet@latest
Description: Ethereum mainnet blockchain data

📊 blocks (21 columns)
• block_num UInt64 NOT NULL
• timestamp Timestamp(Nanosecond) NOT NULL
• hash FixedSizeBinary(32) NOT NULL
• parent_hash FixedSizeBinary(32) NOT NULL
• miner FixedSizeBinary(20) NOT NULL
...

📊 transactions (24 columns)
• block_num UInt64 NOT NULL
• tx_hash FixedSizeBinary(32) NOT NULL
• from FixedSizeBinary(20) NOT NULL
• to FixedSizeBinary(20) NULL
...
```

### `describe(namespace, name, version='latest')`

Returns a structured dictionary mapping table names to column information. Use this for programmatic access.

**Returns:**
```python
{
'blocks': [
{'name': 'block_num', 'type': 'UInt64', 'nullable': False},
{'name': 'timestamp', 'type': 'Timestamp(Nanosecond)', 'nullable': False},
{'name': 'hash', 'type': 'FixedSizeBinary(32)', 'nullable': False},
...
],
'transactions': [
{'name': 'tx_hash', 'type': 'FixedSizeBinary(32)', 'nullable': False},
...
]
}
```

## Use Cases

### 1. Interactive Exploration

```python
# Quickly see what's available
client.datasets.inspect('namespace', 'dataset-name')
```

### 2. Finding Specific Columns

```python
schema = client.datasets.describe('namespace', 'dataset-name')

# Find tables with specific columns
for table_name, columns in schema.items():
col_names = [col['name'] for col in columns]
if 'address' in col_names:
print(f"Table '{table_name}' has an address column")
```

### 3. Finding Ethereum Addresses

```python
schema = client.datasets.describe('namespace', 'dataset-name')

# Find all address columns (20-byte binary fields)
for table_name, columns in schema.items():
address_cols = [col['name'] for col in columns if col['type'] == 'FixedSizeBinary(20)']
if address_cols:
print(f"{table_name}: {', '.join(address_cols)}")

# Example output:
# blocks: miner
# transactions: from, to
# logs: address
```

### 4. Finding Transaction/Block Hashes

```python
schema = client.datasets.describe('namespace', 'dataset-name')

# Find all hash columns (32-byte binary fields)
for table_name, columns in schema.items():
hash_cols = [col['name'] for col in columns if col['type'] == 'FixedSizeBinary(32)']
if hash_cols:
print(f"{table_name}: {', '.join(hash_cols)}")

# Example output:
# blocks: hash, parent_hash, state_root, transactions_root
# transactions: block_hash, tx_hash
# logs: block_hash, tx_hash, topic0, topic1, topic2, topic3
```

### 5. Checking Nullable Columns

```python
schema = client.datasets.describe('namespace', 'dataset-name')

# Find columns that allow NULL values (important for data quality)
for table_name, columns in schema.items():
nullable_cols = [col['name'] for col in columns if col['nullable']]
print(f"{table_name}: {len(nullable_cols)}/{len(columns)} nullable columns")
print(f" Nullable: {', '.join(nullable_cols[:5])}")

# Example output:
# transactions: 5/24 nullable columns
# Nullable: to, gas_price, value, max_fee_per_gas, max_priority_fee_per_gas
```

### 6. Building Dynamic Queries

```python
from amp import Client

registry_client = RegistryClient()
client = Client(
query_url='grpc://localhost:1602',
admin_url='http://localhost:8080',
auth=True
)

# Discover available tables
schema = registry_client.datasets.describe('namespace', 'dataset-name')
print(f"Available tables: {list(schema.keys())}")

# Build query based on available columns
if 'blocks' in schema:
block_cols = [col['name'] for col in schema['blocks']]
if 'block_num' in block_cols and 'timestamp' in block_cols:
# Safe to query these columns
result = client.sql("SELECT block_num, timestamp FROM blocks LIMIT 10")
```

## Supported Arrow Types

The `describe()` and `inspect()` methods handle these Arrow types:

- **Primitives**: `UInt64`, `Int32`, `Boolean`, `Binary`
- **Timestamps**: `Timestamp(Nanosecond)`, `Timestamp(Microsecond)`, etc.
- **Fixed-size Binary**: `FixedSizeBinary(20)` (addresses), `FixedSizeBinary(32)` (hashes)
- **Decimals**: `Decimal128(38,0)` (large integers), `Decimal128(18,6)` (fixed-point)

## Complete Example

```python
from amp.registry import RegistryClient
from amp import Client

# Step 1: Discover datasets
registry = RegistryClient()
results = registry.datasets.search('ethereum blocks')

print("Available datasets:")
for ds in results.datasets[:5]:
print(f" • {ds.namespace}/{ds.name}")

# Step 2: Inspect a dataset
print("\nInspecting dataset structure:")
registry.datasets.inspect('graphops', 'ethereum-mainnet')

# Step 3: Get schema programmatically
schema = registry.datasets.describe('graphops', 'ethereum-mainnet')

# Step 4: Query based on discovered schema
client = Client(query_url='grpc://your-server:1602', auth=True)

# Find tables with block_num column
tables_with_blocks = [
table for table, cols in schema.items()
if any(col['name'] == 'block_num' for col in cols)
]

for table in tables_with_blocks:
print(f"\nQuerying {table}...")
results = client.sql(f"SELECT * FROM {table} LIMIT 5").to_arrow()
print(f" Rows: {len(results)}")
```

## Tips

1. **Use `inspect()` interactively**: Great for Jupyter notebooks or REPL exploration
2. **Use `describe()` in scripts**: When you need programmatic access to schema info
3. **Check nullability**: The `nullable` field tells you if a column can have NULL values
4. **Version pinning**: Always specify a version in production (`version='1.2.3'`) instead of using `'latest'`
75 changes: 74 additions & 1 deletion src/amp/admin/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
including registration, deployment, versioning, and manifest operations.
"""

from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING, Dict, Optional

from amp.utils.manifest_inspector import describe_manifest, print_schema

from . import models

Expand Down Expand Up @@ -198,6 +200,77 @@ def get_manifest(self, namespace: str, name: str, revision: str) -> dict:
response = self._admin._request('GET', path)
return response.json()

def describe(self, namespace: str, name: str, revision: str = 'latest') -> Dict[str, list[Dict[str, str | bool]]]:
"""Get a structured summary of tables and columns in a dataset.

Returns a dictionary mapping table names to lists of column information,
making it easy to programmatically inspect the dataset schema.

Args:
namespace: Dataset namespace
name: Dataset name
revision: Version tag (default: 'latest')

Returns:
dict: Mapping of table names to column information. Each column is a dict with:
- name: Column name (str)
- type: Arrow type (str, simplified representation)
- nullable: Whether the column allows NULL values (bool)

Example:
>>> client = AdminClient('http://localhost:8080')
>>> schema = client.datasets.describe('_', 'eth_firehose', 'latest')
>>> for table_name, columns in schema.items():
... print(f"\\nTable: {table_name}")
... for col in columns:
... nullable = "NULL" if col['nullable'] else "NOT NULL"
... print(f" {col['name']}: {col['type']} {nullable}")
"""
manifest = self.get_manifest(namespace, name, revision)
return describe_manifest(manifest)

def inspect(self, namespace: str, name: str, revision: str = 'latest') -> None:
"""Pretty-print the structure of a dataset for easy inspection.

Displays tables and their columns in a human-readable format.
This is perfect for exploring datasets interactively.

Args:
namespace: Dataset namespace
name: Dataset name
revision: Version tag (default: 'latest')

Example:
>>> client = AdminClient('http://localhost:8080')
>>> client.datasets.inspect('_', 'eth_firehose')
Dataset: _/eth_firehose@latest

blocks (21 columns)
block_num UInt64 NOT NULL
timestamp Timestamp NOT NULL
hash FixedSizeBinary(32) NOT NULL
...

transactions (24 columns)
tx_hash FixedSizeBinary(32) NOT NULL
from FixedSizeBinary(20) NOT NULL
to FixedSizeBinary(20) NULL
...
"""
header = f'Dataset: {namespace}/{name}@{revision}'

# Try to get version info for additional context (optional, might not always work)
try:
version_info = self.get_version(namespace, name, revision)
if hasattr(version_info, 'kind'):
header += f'\nKind: {version_info.kind}'
except Exception:
# If we can't get version info, that's okay - just continue
pass

schema = self.describe(namespace, name, revision)
print_schema(schema, header=header)

def delete(self, namespace: str, name: str) -> None:
"""Delete all versions and metadata for a dataset.

Expand Down
68 changes: 68 additions & 0 deletions src/amp/registry/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import logging
from typing import TYPE_CHECKING, Any, Dict, Optional

from amp.utils.manifest_inspector import describe_manifest, print_schema

from . import models

if TYPE_CHECKING:
Expand Down Expand Up @@ -197,6 +199,72 @@ def get_manifest(self, namespace: str, name: str, version: str) -> dict:
response = self._registry._request('GET', path)
return response.json()

def describe(self, namespace: str, name: str, version: str = 'latest') -> Dict[str, list[Dict[str, str | bool]]]:
"""Get a structured summary of tables and columns in a dataset.

Returns a dictionary mapping table names to lists of column information,
making it easy to programmatically inspect the dataset schema.

Args:
namespace: Dataset namespace
name: Dataset name
version: Version tag (default: 'latest')

Returns:
dict: Mapping of table names to column information. Each column is a dict with:
- name: Column name (str)
- type: Arrow type (str, simplified representation)
- nullable: Whether the column allows NULL values (bool)

Example:
>>> client = RegistryClient()
>>> schema = client.datasets.describe('edgeandnode', 'ethereum-mainnet', 'latest')
>>> for table_name, columns in schema.items():
... print(f"\\nTable: {table_name}")
... for col in columns:
... nullable = "NULL" if col['nullable'] else "NOT NULL"
... print(f" {col['name']}: {col['type']} {nullable}")
"""
manifest = self.get_manifest(namespace, name, version)
return describe_manifest(manifest)

def inspect(self, namespace: str, name: str, version: str = 'latest') -> None:
"""Pretty-print the structure of a dataset for easy inspection.

Displays tables and their columns in a human-readable format.
This is perfect for exploring datasets interactively.

Args:
namespace: Dataset namespace
name: Dataset name
version: Version tag (default: 'latest')

Example:
>>> client = RegistryClient()
>>> client.datasets.inspect('graphops', 'ethereum-mainnet')
Dataset: graphops/ethereum-mainnet@latest

blocks (4 columns)
block_num UInt64 NOT NULL
timestamp Timestamp NOT NULL
hash FixedSizeBinary(32) NOT NULL
parent_hash FixedSizeBinary(32) NOT NULL

transactions (23 columns)
block_num UInt64 NOT NULL
tx_hash FixedSizeBinary(32) NOT NULL
...
"""
# Get dataset info
dataset = self.get(namespace, name)
header = f'Dataset: {namespace}/{name}@{version}'
if dataset.description:
header += f'\nDescription: {dataset.description}'

# Get schema and print
schema = self.describe(namespace, name, version)
print_schema(schema, header=header)

# Write Operations (Require Authentication)

def publish(
Expand Down
5 changes: 5 additions & 0 deletions src/amp/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Utility modules for amp Python client."""

from .manifest_inspector import describe_manifest, format_arrow_type, print_schema

__all__ = ['describe_manifest', 'format_arrow_type', 'print_schema']
Loading
Loading