"""
YAML loader with provenance tracking.
Provides ``ProvenanceConstructor`` which captures file, line, and column
information for every value during YAML parsing.
"""
import os
from pathlib import Path
from ruamel.yaml import YAML
from ruamel.yaml.constructor import RoundTripConstructor
from ._config import get_config
from ._dict import DictWithProvenance
class ProvenanceConstructor(RoundTripConstructor):
    """
    A YAML constructor that captures provenance (line, column) for every node.

    Instead of returning plain values, ``construct_object`` returns
    ``(data, (line, col))`` tuples.  These can then be split into a data dict
    and a provenance dict for use with ``DictWithProvenance``.
    """

    # No ``__init__`` override is defined: the previous one only forwarded its
    # arguments to the parent, so the inherited constructor is used directly.

    def construct_object(self, node, *args, **kwargs):
        """
        Construct *node* and pair the result with its source position.

        Parameters
        ----------
        node : object
            YAML node exposing ``start_mark.line`` and ``start_mark.column``.
        *args, **kwargs
            Forwarded unchanged to ``RoundTripConstructor.construct_object``.

        Returns
        -------
        tuple
            ``(data, (line, col))`` where ``data`` is whatever the parent
            constructor produced and ``line`` / ``col`` are the node's start
            mark line and column, each incremented by one.
        """
        data = super().construct_object(node, *args, **kwargs)
        mark = node.start_mark
        # NOTE(review): the +1 presumably converts 0-based marks into 1-based,
        # editor-style positions -- confirm against the YAML library's marks.
        provenance = (mark.line + 1, mark.column + 1)
        return (data, provenance)
def _is_prov_tuple(val):
"""Check if a value is a (data, (line, col)) provenance tuple."""
return (
isinstance(val, tuple)
and len(val) == 2
and isinstance(val[1], tuple)
and len(val[1]) == 2
and isinstance(val[1][0], int)
and isinstance(val[1][1], int)
)
class ProvenanceLoader:
    """
    High-level YAML loader that produces ``DictWithProvenance`` objects.

    Parameters
    ----------
    category_resolver : callable or None
        A function ``(filepath: str) -> (category, subcategory)`` that maps
        file paths to categories.  Default: returns ``(None, None)``.
    config : ProvenanceConfig or None
        Configuration for provenance tracking.  If ``None``, uses module
        default.
    """

    def __init__(self, category_resolver=None, config=None):
        self._category_resolver = category_resolver or (lambda f: (None, None))
        self._config = config or get_config()
        # A round-trip YAML instance whose constructor wraps every node as
        # ``(data, (line, col))``.
        self._yaml = YAML()
        self._yaml.Constructor = ProvenanceConstructor

    def load(self, filepath):
        """
        Load a YAML file and return a ``DictWithProvenance``.

        Parameters
        ----------
        filepath : str or Path
            Path to the YAML file.  It is converted with ``str()`` before
            being passed to the category resolver and recorded as
            ``yaml_file`` in the provenance entries.

        Returns
        -------
        DictWithProvenance
            The loaded data with provenance tracking.  An empty
            ``DictWithProvenance`` is returned when the document yields
            ``None`` (nothing loaded, or an explicit null document).

        Raises
        ------
        TypeError
            If the top-level YAML node is neither a mapping nor null.
        """
        filepath = str(filepath)
        category, subcategory = self._category_resolver(filepath)
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale-dependent default text encoding.
        with open(filepath, "r", encoding="utf-8") as f:
            raw = self._yaml.load(f)
        # The root node is also wrapped: (dict_with_tuples, (line, col))
        if _is_prov_tuple(raw):
            raw = raw[0]
        # Checked *after* unwrapping so that an explicit null document, which
        # arrives as ``(None, (line, col))``, is treated like an empty one.
        if raw is None:
            return DictWithProvenance({}, {}, config=self._config)
        if not isinstance(raw, dict):
            raise TypeError(
                f"{filepath}: top-level YAML node must be a mapping, "
                f"got {type(raw).__name__}"
            )
        data, provenance = self._split_dict(raw, filepath, category, subcategory)
        return DictWithProvenance(data, provenance, config=self._config)

    def _split_value(self, raw_val, filepath, category, subcategory):
        """
        Split one (possibly wrapped) value into ``(data, provenance)``.

        Values that are not provenance tuples are passed through with an
        empty provenance dict.  Wrapped dicts and lists are split
        recursively; their own ``(line, col)`` is not recorded.  Any other
        wrapped value becomes a leaf with a provenance record.
        """
        if not _is_prov_tuple(raw_val):
            return raw_val, {}
        val, (line, col) = raw_val
        if isinstance(val, dict):
            return self._split_dict(val, filepath, category, subcategory)
        if isinstance(val, list):
            return self._split_list(val, filepath, category, subcategory)
        leaf_provenance = {
            "line": line,
            "col": col,
            "yaml_file": filepath,
            "category": category,
            "subcategory": subcategory,
        }
        return val, leaf_provenance

    def _split_dict(self, raw_dict, filepath, category, subcategory):
        """
        Recursively split a dict whose keys and values are provenance tuples
        into separate data and provenance dicts.

        Returns
        -------
        tuple of (dict, dict)
            ``(data, prov)`` sharing the same (unwrapped) keys.
        """
        data = {}
        prov = {}
        for raw_key, raw_val in raw_dict.items():
            # Keys are wrapped too; only the plain key is kept.
            key = raw_key[0] if _is_prov_tuple(raw_key) else raw_key
            data[key], prov[key] = self._split_value(
                raw_val, filepath, category, subcategory
            )
        return data, prov

    def _split_list(self, raw_list, filepath, category, subcategory):
        """
        Recursively split a list whose elements are provenance tuples.

        Returns
        -------
        tuple of (list, list)
            ``(data, prov)`` of equal length, aligned by index.
        """
        data = []
        prov = []
        for item in raw_list:
            item_data, item_prov = self._split_value(
                item, filepath, category, subcategory
            )
            data.append(item_data)
            prov.append(item_prov)
        return data, prov
def load_yaml(filepath, category_resolver=None, config=None):
    """
    Load a single YAML file with provenance tracking in one call.

    A fresh ``ProvenanceLoader`` is built from the given options and used
    once to load *filepath*.

    Parameters
    ----------
    filepath : str or Path
        Path to the YAML file.
    category_resolver : callable or None
        Maps file paths to ``(category, subcategory)`` tuples.
    config : ProvenanceConfig or None
        Configuration for provenance tracking.

    Returns
    -------
    DictWithProvenance
        The loaded data with provenance.
    """
    return ProvenanceLoader(
        category_resolver=category_resolver,
        config=config,
    ).load(filepath)