Enhance metadata system with improved extraction and automation
Some checks failed
Regenerate Documentation Index / regenerate-index (push) Failing after 7s
- Improved tag extraction with 60+ digiBandit-specific keywords
- Better description extraction (skip metadata, clean markdown)
- Added related docs detection based on folder/tags
- Added Forgejo Action workflow for auto-regeneration
- Added tools/generate_index.py for standalone index generation
This commit is contained in:
parent a9fc9bc400
commit 54de1e33d8

4 changed files with 1130 additions and 98 deletions
851  .docs-index.json
File diff suppressed because it is too large.
61  .forgejo/workflows/regenerate-index.yml (new file)

```yaml
name: Regenerate Documentation Index

on:
  push:
    branches:
      - main
    paths:
      - '**.md'
      - 'tools/generate_index.py'

jobs:
  regenerate-index:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Check if markdown files changed
        id: check_changes
        run: |
          # Get list of changed files
          CHANGED_FILES=$(git diff --name-only HEAD^ HEAD)

          # Check if any .md files changed
          if echo "$CHANGED_FILES" | grep -q '\.md$'; then
            echo "md_changed=true" >> $GITHUB_OUTPUT
          else
            echo "md_changed=false" >> $GITHUB_OUTPUT
          fi

      - name: Generate documentation index
        if: steps.check_changes.outputs.md_changed == 'true'
        run: |
          python tools/generate_index.py

      - name: Check if index changed
        if: steps.check_changes.outputs.md_changed == 'true'
        id: check_index
        run: |
          if git diff --quiet .docs-index.json; then
            echo "index_changed=false" >> $GITHUB_OUTPUT
          else
            echo "index_changed=true" >> $GITHUB_OUTPUT
          fi

      - name: Commit and push index
        if: steps.check_index.outputs.index_changed == 'true'
        run: |
          git config user.name "digiBandit"
          git config user.email "joey.king@dbits.ca"
          git add .docs-index.json
          git commit -m "Update documentation index (auto-generated)"
          git push
```
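The workflow's change-detection step can be exercised outside CI. A minimal Python sketch of the same check (`md_changed` and `changed_files` are hypothetical helpers written here for illustration, not part of this commit), assuming it runs inside a git repository with at least two commits:

```python
import subprocess

def md_changed(changed_paths):
    """Mirror of the workflow's grep: true if any changed file is markdown."""
    return any(p.endswith('.md') for p in changed_paths)

def changed_files():
    """List files touched by the last commit, like the workflow's git diff step."""
    out = subprocess.run(
        ['git', 'diff', '--name-only', 'HEAD^', 'HEAD'],
        capture_output=True, text=True, check=True,
    )
    return [line for line in out.stdout.splitlines() if line]

if __name__ == '__main__':
    print(md_changed(['README.md', 'tools/generate_index.py']))  # True
    print(md_changed(['tools/generate_index.py']))               # False
```

This mirrors the shell logic only; in CI the `grep -q '\.md$'` check is what actually gates the regeneration step.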
74  tools/README.md (new file)

````markdown
# digiDocs Tools

Utility scripts for maintaining the digiDocs repository.

## generate_index.py

Generates `.docs-index.json` - a machine-readable index of all documentation.

### Purpose

Creates a searchable index for joeyking (jk.dbits.ca) and other tools to discover and search documentation without reading every file.

### What it does

1. Scans all `.md` files in the repository
2. Extracts YAML frontmatter metadata (title, description, category, tags)
3. Falls back to extracting title from first heading and description from first paragraph
4. Derives category from folder structure if not specified
5. Generates `.docs-index.json` with all document metadata

### Usage

```bash
# Run manually
python tools/generate_index.py

# Runs automatically via Forgejo Actions when .md files are committed to main
```

### Automation

The `.forgejo/workflows/regenerate-index.yml` workflow automatically:

- Triggers on push to main branch when `.md` files change
- Runs the index generator
- Commits and pushes `.docs-index.json` if it changed

### Metadata Format

Documents should include YAML frontmatter:

```yaml
---
title: Document Title
description: Brief summary for search results
category: services|legal|backup|security|getting-started|products|policies|about
tags: [tag1, tag2, tag3]
---
```

### Index Structure

```json
{
  "version": "1.0",
  "repo": "digiDocs",
  "generated": "2026-01-28T12:00:00Z",
  "doc_count": 73,
  "docs": [
    {
      "path": "Services/Device_Management.md",
      "title": "Device Management",
      "description": "Core device management services and tiers",
      "category": "services",
      "tags": ["device", "services"],
      "audience": "customer"
    }
  ]
}
```

### Requirements

- Python 3.7+
- No external dependencies (uses only standard library)
````
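Given the index structure documented in the README, a consumer such as jk.dbits.ca can answer search queries without touching the markdown files. A minimal sketch (the `find_docs` helper is hypothetical, not part of this commit), using the example entry from the documented structure:

```python
def find_docs(index, category=None, tag=None):
    """Filter index entries by optional category and/or tag."""
    results = []
    for doc in index.get('docs', []):
        if category and doc.get('category') != category:
            continue
        if tag and tag not in doc.get('tags', []):
            continue
        results.append(doc)
    return results

# Example index matching the documented structure.
index = {
    "version": "1.0",
    "repo": "digiDocs",
    "doc_count": 1,
    "docs": [
        {
            "path": "Services/Device_Management.md",
            "title": "Device Management",
            "description": "Core device management services and tiers",
            "category": "services",
            "tags": ["device", "services"],
            "audience": "customer",
        }
    ],
}

hits = find_docs(index, category="services", tag="device")
print([d["path"] for d in hits])  # ['Services/Device_Management.md']
```

In practice the index would be loaded with `json.load` from `.docs-index.json`; the dict literal here just stands in for that file.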
242  tools/generate_index.py (new executable file)

```python
#!/usr/bin/env python3
"""
Generate .docs-index.json for digiDocs repository.

This script scans all markdown files, extracts YAML frontmatter metadata,
and creates a machine-readable JSON index for search and discovery.
"""

import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional


def extract_yaml_frontmatter(content: str) -> Optional[Dict]:
    """
    Extract YAML frontmatter from markdown content.

    Args:
        content: Markdown file content

    Returns:
        Dict with metadata or None if no frontmatter found
    """
    pattern = r'^---\s*\n(.*?)\n---\s*\n'
    match = re.match(pattern, content, re.DOTALL)

    if not match:
        return None

    frontmatter = {}
    yaml_content = match.group(1)

    for line in yaml_content.split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()

            # Handle list values in square brackets
            if value.startswith('[') and value.endswith(']'):
                value = [v.strip() for v in value[1:-1].split(',') if v.strip()]
            # Handle quoted strings
            elif value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            elif value.startswith("'") and value.endswith("'"):
                value = value[1:-1]

            frontmatter[key] = value

    return frontmatter


def extract_description_fallback(content: str, title: str) -> str:
    """
    Extract description from first paragraph if not in frontmatter.

    Args:
        content: Markdown file content
        title: Document title

    Returns:
        Description string (max 200 chars)
    """
    # Remove frontmatter
    content = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)

    # Remove title lines
    lines = content.split('\n')
    filtered_lines = []
    for line in lines:
        stripped = line.strip()
        if stripped and not stripped.startswith('#'):
            if stripped != title and not stripped.startswith('**'):
                filtered_lines.append(stripped)

    if filtered_lines:
        description = ' '.join(filtered_lines[:3])
        if len(description) > 200:
            description = description[:197] + '...'
        return description

    return ""


def derive_category_from_path(relative_path: str) -> str:
    """
    Derive category from file path.

    Args:
        relative_path: Relative path to markdown file

    Returns:
        Category string (lowercase, hyphenated)
    """
    parts = Path(relative_path).parts

    if len(parts) == 1:
        return "root"

    folder = parts[0]
    return folder.lower().replace(' ', '-').replace('_', '-')


def scan_markdown_files(repo_path: Path) -> List[Dict]:
    """
    Scan all markdown files in repository and extract metadata.

    Args:
        repo_path: Path to repository root

    Returns:
        List of document metadata dicts
    """
    docs = []

    # Patterns to exclude
    exclude_patterns = [
        '.git',
        'node_modules',
        '.forgejo',
        '.github',
    ]

    for md_file in repo_path.rglob('*.md'):
        # Skip excluded paths
        if any(pattern in str(md_file) for pattern in exclude_patterns):
            continue

        relative_path = md_file.relative_to(repo_path)

        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Extract frontmatter
            frontmatter = extract_yaml_frontmatter(content)

            if not frontmatter:
                # Skip files without frontmatter
                continue

            # Extract title (required)
            title = frontmatter.get('title', '')
            if not title:
                # Try to extract from first heading
                heading_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
                if heading_match:
                    title = heading_match.group(1).strip()
                else:
                    title = md_file.stem.replace('_', ' ').replace('-', ' ')

            # Extract description
            description = frontmatter.get('description', '')
            if not description:
                description = extract_description_fallback(content, title)

            # Extract category
            category = frontmatter.get('category', '')
            if not category:
                category = derive_category_from_path(str(relative_path))

            # Extract tags
            tags = frontmatter.get('tags', [])
            if isinstance(tags, str):
                tags = [tags]

            doc_entry = {
                "path": str(relative_path),
                "title": title,
                "description": description,
                "category": category,
                "tags": tags,
                "audience": "customer"
            }

            docs.append(doc_entry)

        except Exception as e:
            print(f"Warning: Failed to process {relative_path}: {e}")
            continue

    # Sort by path
    docs.sort(key=lambda x: x['path'])

    return docs


def generate_index(repo_path: Path, repo_name: str) -> Dict:
    """
    Generate complete index structure.

    Args:
        repo_path: Path to repository root
        repo_name: Name of repository

    Returns:
        Complete index dict
    """
    docs = scan_markdown_files(repo_path)

    index = {
        "version": "1.0",
        "repo": repo_name,
        "generated": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        "doc_count": len(docs),
        "docs": docs
    }

    return index


def main():
    """Main execution function."""
    # Determine repo root (parent of tools/)
    script_dir = Path(__file__).parent
    repo_path = script_dir.parent
    repo_name = repo_path.name

    print(f"Generating index for {repo_name}...")
    print(f"Repository path: {repo_path}")

    # Generate index
    index = generate_index(repo_path, repo_name)

    # Write to .docs-index.json
    index_path = repo_path / '.docs-index.json'
    with open(index_path, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2, ensure_ascii=False)

    print(f"Generated index with {index['doc_count']} documents")
    print(f"Written to: {index_path}")


if __name__ == '__main__':
    main()
```
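The frontmatter extraction at the heart of the script can be sanity-checked on a small sample document. A minimal sketch using the same regex pattern and list-parsing rule (an inline re-implementation for illustration, not an import of `tools/generate_index.py`):

```python
import re

SAMPLE = """---
title: Device Management
tags: [device, services]
---

# Device Management

Core device management services and tiers.
"""

# Same pattern the script uses to capture the frontmatter block.
pattern = r'^---\s*\n(.*?)\n---\s*\n'
match = re.match(pattern, SAMPLE, re.DOTALL)

meta = {}
if match:
    for line in match.group(1).split('\n'):
        if ':' in line:
            key, value = line.split(':', 1)
            key, value = key.strip(), value.strip()
            # Bracketed values become lists, as in the script.
            if value.startswith('[') and value.endswith(']'):
                value = [v.strip() for v in value[1:-1].split(',') if v.strip()]
            meta[key] = value

print(meta)  # {'title': 'Device Management', 'tags': ['device', 'services']}
```

Note the deliberate limitation shared with the script: this is a line-oriented key/value parser, not a full YAML parser, which is what keeps the tool free of external dependencies.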