Markdown Section Seed Reader

Turn a directory of Markdown files into a seed dataset with one row per section. This recipe stays in the same single-file format as the other recipes: it creates sample files, defines an inline FileSystemSeedReader[DirectorySeedSource], and passes that reader to DataDesigner(seed_readers=[...]).

This keeps the example focused on the actual seed reader contract:

- implementing build_manifest(...)
- returning 1:N hydrated rows from hydrate_row(...)
- declaring output_columns for the hydrated schema
- keeping IndexRange selection manifest-based

Because the example reuses DirectorySeedSource, it does not register a brand-new seed_type. If you later want to package the same reader as an installable plugin, see FileSystemSeedReader Plugins.

Run the Recipe

Run the script directly:

uv run markdown_seed_reader.py

The script prints two previews:

- the full section dataset across all Markdown files
- a manifest-only selection using IndexRange(start=1, end=1) that still returns every section from the selected file

Download Code

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "data-designer",
# ]
# ///
"""Markdown Section Seed Reader Recipe

Prototype a custom FileSystemSeedReader inline by overriding how one
DataDesigner instance handles DirectorySeedSource inputs. The reader keeps a
file-based manifest and fans each Markdown file out into one row per section.
This keeps the example in the same single-file format as the other recipes
while still showing the core `build_manifest(...)` and `hydrate_row(...)`
contract for a custom filesystem-backed seed reader.

Run:
    uv run markdown_seed_reader.py
"""

from __future__ import annotations

import re
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, ClassVar

import data_designer.config as dd
from data_designer.config.seed import IndexRange
from data_designer.engine.resources.seed_reader import FileSystemSeedReader, SeedReaderFileSystemContext
from data_designer.interface import DataDesigner

# Matches an ATX heading line: one to six `#` characters at the start of the
# line (group 1), at least one space or tab, then the heading text (group 2)
# captured lazily so trailing whitespace is left out of the group.
_ATX_HEADING_PATTERN = re.compile(r"^(#{1,6})[ \t]+(.+?)\s*$")


class MarkdownSectionDirectorySeedReader(FileSystemSeedReader[dd.DirectorySeedSource]):
    """Seed reader that expands every matched Markdown file into per-section rows.

    The manifest stays at file granularity (one entry per matched path), while
    hydration reads a file and emits one record for each section found in it.
    """

    # Keys of the hydrated records, listed in the order the row dicts are built.
    output_columns: ClassVar[list[str]] = [
        "relative_path",
        "file_name",
        "section_index",
        "section_header",
        "section_content",
    ]

    def build_manifest(self, *, context: SeedReaderFileSystemContext) -> list[dict[str, str]]:
        """List the matched files without opening any of them.

        Args:
            context: Filesystem context forwarded to the path-matching helper.

        Returns:
            One dict per matched path with its ``relative_path`` and ``file_name``.
        """

        # The source's own pattern and recursion flag decide which files match.
        paths_in_source = self.get_matching_relative_paths(
            context=context,
            file_pattern=self.source.file_pattern,
            recursive=self.source.recursive,
        )
        return [
            dict(relative_path=path_in_source, file_name=Path(path_in_source).name)
            for path_in_source in paths_in_source
        ]

    def hydrate_row(
        self,
        *,
        manifest_row: dict[str, Any],
        context: SeedReaderFileSystemContext,
    ) -> list[dict[str, Any]]:
        """Expand one manifest entry into a record per Markdown section.

        Args:
            manifest_row: Manifest entry carrying ``relative_path`` and ``file_name``.
            context: Filesystem context whose ``fs`` is used to open the file.

        Returns:
            One dict per extracted section, each keyed by ``output_columns`` and
            numbered from zero in ``section_index``.
        """

        source_path = str(manifest_row["relative_path"])
        source_name = str(manifest_row["file_name"])
        with context.fs.open(source_path, "r", encoding="utf-8") as markdown_file:
            raw_markdown = markdown_file.read()

        # The file name doubles as the header for text that has no heading of its own.
        parsed_sections = extract_markdown_sections(markdown_text=raw_markdown, fallback_header=source_name)

        # Fields shared by every record that comes from this file.
        file_fields: dict[str, Any] = {"relative_path": source_path, "file_name": source_name}

        hydrated_rows: list[dict[str, Any]] = []
        for position, (header, body) in enumerate(parsed_sections):
            hydrated_rows.append(
                {
                    **file_fields,
                    "section_index": position,
                    "section_header": header,
                    "section_content": body,
                }
            )
        return hydrated_rows


# Fence line of a fenced code block: up to three spaces of indentation, a run of
# at least three backticks or tildes (group 1), then the rest of the line (group 2).
_CODE_FENCE_PATTERN = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")


def extract_markdown_sections(*, markdown_text: str, fallback_header: str) -> list[tuple[str, str]]:
    """Split Markdown into `(header, content)` pairs using ATX headings.

    Lines inside fenced code blocks (``` or ~~~) are never treated as headings,
    so `# comment` lines in code samples stay part of the surrounding section.
    An unclosed fence runs to the end of the text.

    Args:
        markdown_text: Full Markdown document text.
        fallback_header: Header used for text that appears before the first
            heading, or for a document without any heading.

    Returns:
        Sections in document order. Each content string is the section's lines
        joined with newlines and stripped of surrounding whitespace. Pairs whose
        header and content are both empty are dropped.
    """

    sections: list[tuple[str, str]] = []
    current_header = fallback_header
    current_lines: list[str] = []
    saw_heading = False
    # Fence run that opened the code block we are currently inside, if any.
    open_fence: str | None = None

    for line in markdown_text.splitlines():
        fence_match = _CODE_FENCE_PATTERN.match(line)

        if open_fence is not None:
            # Inside a code block every line is content, including `#` lines.
            current_lines.append(line)
            if (
                fence_match is not None
                and fence_match.group(1)[0] == open_fence[0]
                and len(fence_match.group(1)) >= len(open_fence)
                and not fence_match.group(2).strip()
            ):
                # A closing fence uses the same character, is at least as long as
                # the opening fence, and carries no info string.
                open_fence = None
            continue

        if fence_match is not None:
            fence_run, info_string = fence_match.groups()
            # A backtick fence whose info string contains a backtick is really an
            # inline code span, not the start of a code block.
            if not (fence_run.startswith("`") and "`" in info_string):
                open_fence = fence_run
                current_lines.append(line)
                continue

        heading_match = _ATX_HEADING_PATTERN.match(line)
        if heading_match is not None:
            # Flush the running section; text before the first heading is only
            # kept when it has non-blank content.
            if saw_heading or any(existing_line.strip() for existing_line in current_lines):
                sections.append((current_header, "\n".join(current_lines).strip()))
            current_header = heading_match.group(2).strip()
            current_lines = []
            saw_heading = True
            continue
        current_lines.append(line)

    # Flush the trailing section; a heading-less document yields one section
    # under the fallback header as long as it is not blank.
    if saw_heading or markdown_text.strip():
        sections.append((current_header, "\n".join(current_lines).strip()))

    return [
        (section_header, section_content)
        for section_header, section_content in sections
        if section_header or section_content
    ]


def create_sample_markdown_files(seed_dir: Path) -> None:
    """Write the two demo Markdown documents that the recipe reads back as seed data.

    Args:
        seed_dir: Existing directory that receives ``faq.md`` and ``guide.md``.
    """

    # File name -> document text; each document has one `#` and one `##` section.
    sample_documents = {
        "faq.md": "# FAQ\nAnswers to frequent questions.\n\n## Support\nContact support@example.com.",
        "guide.md": "# Quickstart\nInstall Data Designer.\n\n## Usage\nRun the recipe with uv.",
    }
    for document_name, document_text in sample_documents.items():
        seed_dir.joinpath(document_name).write_text(document_text, encoding="utf-8")


def build_config(
    *,
    seed_path: Path,
    selection_strategy: IndexRange | None = None,
) -> dd.DataDesignerConfigBuilder:
    """Assemble the config builder shared by the recipe's preview runs.

    Args:
        seed_path: Directory whose ``*.md`` files form the seed dataset.
        selection_strategy: Optional selection passed through to the seed
            dataset; ``None`` leaves it unset.

    Returns:
        A builder with the directory seed source attached and a
        ``section_summary`` expression column added.
    """

    builder = dd.DataDesignerConfigBuilder()

    markdown_source = dd.DirectorySeedSource(path=str(seed_path), file_pattern="*.md")
    builder.with_seed_dataset(markdown_source, selection_strategy=selection_strategy)

    # Derived column that combines two of the seed columns into one label.
    summary_column = dd.ExpressionColumnConfig(
        name="section_summary",
        expr="{{ file_name }} :: {{ section_header }}",
    )
    builder.add_column(summary_column)

    return builder


def print_preview(
    *,
    data_designer: DataDesigner,
    title: str,
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
) -> None:
    """Print a titled preview limited to the walkthrough's key columns.

    Args:
        data_designer: Instance used to run the preview.
        title: Heading line printed before the table.
        config_builder: Config to preview.
        num_records: Number of records requested from the preview.
    """

    shown_columns = [
        "relative_path",
        "section_index",
        "section_header",
        "section_summary",
    ]

    print(title)
    preview_result = data_designer.preview(config_builder, num_records=num_records)
    walkthrough_table = preview_result.dataset[shown_columns]
    print(walkthrough_table.to_string(index=False))
    # Blank line so consecutive previews stay visually separated.
    print()


def main() -> None:
    """Create the sample corpus in a temporary directory and print two previews.

    The first preview runs without a selection strategy; the second passes
    ``IndexRange(start=1, end=1)`` to the seed dataset.
    """

    with TemporaryDirectory(prefix="markdown-seed-reader-") as workspace:
        markdown_dir = Path(workspace, "sample_markdown")
        markdown_dir.mkdir()
        create_sample_markdown_files(markdown_dir)

        # Supply the custom reader to this DataDesigner instance.
        designer = DataDesigner(seed_readers=[MarkdownSectionDirectorySeedReader()])

        unfiltered_config = build_config(seed_path=markdown_dir)
        print_preview(
            data_designer=designer,
            title="Full preview across all markdown files",
            config_builder=unfiltered_config,
            num_records=4,
        )

        second_file_config = build_config(
            seed_path=markdown_dir,
            selection_strategy=IndexRange(start=1, end=1),
        )
        print_preview(
            data_designer=designer,
            title="Manifest-based selection of only the second matched file",
            config_builder=second_file_config,
            num_records=2,
        )


# Run the recipe only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()