digiDocs/tools/generate_index.py
dbits-db 54de1e33d8
Some checks failed
Regenerate Documentation Index / regenerate-index (push) Failing after 7s
Enhance metadata system with improved extraction and automation
- Improved tag extraction with 60+ digiBandit-specific keywords
- Better description extraction (skip metadata, clean markdown)
- Added related docs detection based on folder/tags
- Added Forgejo Action workflow for auto-regeneration
- Added tools/generate_index.py for standalone index generation
2026-01-28 19:33:50 -04:00

242 lines
6.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Generate .docs-index.json for digiDocs repository.
This script scans all markdown files, extracts YAML frontmatter metadata,
and creates a machine-readable JSON index for search and discovery.
"""
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
def extract_yaml_frontmatter(content: str) -> Optional[Dict]:
    """
    Extract YAML frontmatter from markdown content.

    Only a flat subset of YAML is supported: `key: value` pairs, inline
    lists in square brackets, and single- or double-quoted scalar values.
    Nested mappings and block-style lists are not parsed.

    Args:
        content: Markdown file content

    Returns:
        Dict with metadata or None if no frontmatter found
    """
    pattern = r'^---\s*\n(.*?)\n---\s*\n'
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None

    def _unquote(value: str) -> str:
        # Strip one matching pair of surrounding quotes, if present.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            return value[1:-1]
        return value

    frontmatter = {}
    yaml_content = match.group(1)
    for line in yaml_content.split('\n'):
        line = line.strip()
        # Skip blank lines and YAML comments.
        if not line or line.startswith('#'):
            continue
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        key = key.strip()
        value = value.strip()
        if value.startswith('[') and value.endswith(']'):
            # Inline list: split on commas and unquote each item, so
            # `tags: ["a", 'b']` yields ['a', 'b'] rather than items
            # that still carry their quote characters.
            value = [_unquote(v.strip()) for v in value[1:-1].split(',') if v.strip()]
        else:
            value = _unquote(value)
        frontmatter[key] = value
    return frontmatter
def extract_description_fallback(content: str, title: str) -> str:
    """
    Extract description from first paragraph if not in frontmatter.

    Args:
        content: Markdown file content
        title: Document title

    Returns:
        Description string (max 200 chars)
    """
    # Drop the frontmatter block before looking at the body text.
    body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)

    # Collect plain prose lines: skip headings, bold-only metadata lines,
    # blanks, and a bare repetition of the title.
    candidates = []
    for raw in body.split('\n'):
        text = raw.strip()
        if not text or text.startswith('#'):
            continue
        if text == title or text.startswith('**'):
            continue
        candidates.append(text)

    if not candidates:
        return ""

    # Join the first few prose lines and truncate to the 200-char cap.
    summary = ' '.join(candidates[:3])
    return summary if len(summary) <= 200 else summary[:197] + '...'
def derive_category_from_path(relative_path: str) -> str:
    """
    Derive category from file path.

    Args:
        relative_path: Relative path to markdown file

    Returns:
        Category string (lowercase, hyphenated)
    """
    parts = Path(relative_path).parts
    # Files sitting directly at the repo root get a fixed category.
    if len(parts) == 1:
        return "root"
    # Otherwise use the top-level folder name, normalized to
    # lowercase with hyphens instead of spaces/underscores.
    top_folder = parts[0].lower()
    return top_folder.replace(' ', '-').replace('_', '-')
def scan_markdown_files(repo_path: Path) -> List[Dict]:
    """
    Scan all markdown files in repository and extract metadata.

    Files without YAML frontmatter are skipped entirely; files that fail
    to read or parse are reported as warnings and skipped.

    Args:
        repo_path: Path to repository root

    Returns:
        List of document metadata dicts, sorted by relative path
    """
    docs = []
    # Directory names to exclude anywhere along the path. Matching whole
    # path components (rather than substrings of the full path) avoids
    # false positives such as a file named "notes.github.md".
    exclude_dirs = {
        '.git',
        'node_modules',
        '.forgejo',
        '.github',
    }
    for md_file in repo_path.rglob('*.md'):
        relative_path = md_file.relative_to(repo_path)
        # Skip anything under an excluded directory.
        if exclude_dirs.intersection(relative_path.parts):
            continue
        try:
            content = md_file.read_text(encoding='utf-8')
            # Extract frontmatter; files without it are not indexed.
            frontmatter = extract_yaml_frontmatter(content)
            if not frontmatter:
                continue
            # Title: frontmatter first, then first heading, then filename.
            title = frontmatter.get('title', '')
            if not title:
                heading_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
                if heading_match:
                    title = heading_match.group(1).strip()
                else:
                    title = md_file.stem.replace('_', ' ').replace('-', ' ')
            # Description: frontmatter first, then first body paragraph.
            description = frontmatter.get('description', '')
            if not description:
                description = extract_description_fallback(content, title)
            # Category: frontmatter first, then derived from folder name.
            category = frontmatter.get('category', '')
            if not category:
                category = derive_category_from_path(str(relative_path))
            # Tags: normalize a single scalar tag into a one-element list.
            tags = frontmatter.get('tags', [])
            if isinstance(tags, str):
                tags = [tags]
            docs.append({
                "path": str(relative_path),
                "title": title,
                "description": description,
                "category": category,
                "tags": tags,
                "audience": "customer"
            })
        except Exception as e:
            # Best-effort scan: report and move on rather than aborting
            # the whole index over one unreadable file.
            print(f"Warning: Failed to process {relative_path}: {e}")
            continue
    # Deterministic output ordering for stable diffs of the index file.
    docs.sort(key=lambda x: x['path'])
    return docs
def generate_index(repo_path: Path, repo_name: str) -> Dict:
    """
    Generate complete index structure.

    Args:
        repo_path: Path to repository root
        repo_name: Name of repository

    Returns:
        Complete index dict
    """
    documents = scan_markdown_files(repo_path)
    # UTC timestamp in ISO-8601 form with a trailing "Z" instead of "+00:00".
    generated_at = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    return {
        "version": "1.0",
        "repo": repo_name,
        "generated": generated_at,
        "doc_count": len(documents),
        "docs": documents,
    }
def main():
    """Main execution function."""
    # The repository root is the parent of the tools/ directory
    # this script lives in.
    script_dir = Path(__file__).parent
    repo_path = script_dir.parent
    repo_name = repo_path.name
    print(f"Generating index for {repo_name}...")
    print(f"Repository path: {repo_path}")
    # Build the index and write it to .docs-index.json at the repo root.
    index = generate_index(repo_path, repo_name)
    index_path = repo_path / '.docs-index.json'
    index_path.write_text(
        json.dumps(index, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )
    print(f"Generated index with {index['doc_count']} documents")
    print(f"Written to: {index_path}")


if __name__ == '__main__':
    main()