Enhance metadata system with improved extraction and automation
Some checks failed
Regenerate Documentation Index / regenerate-index (push) Failing after 7s

- Improved tag extraction with 60+ digiBandit-specific keywords
- Better description extraction (skip metadata, clean markdown)
- Added related docs detection based on folder/tags
- Added Forgejo Action workflow for auto-regeneration
- Added tools/generate_index.py for standalone index generation
This commit is contained in:
dbits-db 2026-01-28 19:33:50 -04:00
parent a9fc9bc400
commit 54de1e33d8
4 changed files with 1130 additions and 98 deletions

File diff suppressed because it is too large

.forgejo/workflows/regenerate-index.yml New file

@@ -0,0 +1,61 @@
name: Regenerate Documentation Index

on:
  push:
    branches:
      - main
    paths:
      - '**.md'
      - 'tools/generate_index.py'

jobs:
  regenerate-index:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Check if markdown files changed
        id: check_changes
        run: |
          # Get list of changed files
          CHANGED_FILES=$(git diff --name-only HEAD^ HEAD)
          # Check if any .md files changed (excluding .docs-index.json)
          if echo "$CHANGED_FILES" | grep -q '\.md$'; then
            echo "md_changed=true" >> $GITHUB_OUTPUT
          else
            echo "md_changed=false" >> $GITHUB_OUTPUT
          fi

      - name: Generate documentation index
        if: steps.check_changes.outputs.md_changed == 'true'
        run: |
          python tools/generate_index.py

      - name: Check if index changed
        if: steps.check_changes.outputs.md_changed == 'true'
        id: check_index
        run: |
          if git diff --quiet .docs-index.json; then
            echo "index_changed=false" >> $GITHUB_OUTPUT
          else
            echo "index_changed=true" >> $GITHUB_OUTPUT
          fi

      - name: Commit and push index
        if: steps.check_index.outputs.index_changed == 'true'
        run: |
          git config user.name "digiBandit"
          git config user.email "joey.king@dbits.ca"
          git add .docs-index.json
          git commit -m "Update documentation index (auto-generated)"
          git push

tools/README.md Normal file

@@ -0,0 +1,74 @@
# digiDocs Tools
Utility scripts for maintaining the digiDocs repository.
## generate_index.py
Generates `.docs-index.json` - a machine-readable index of all documentation.
### Purpose
Creates a searchable index for joeyking (jk.dbits.ca) and other tools to discover and search documentation without reading every file.
### What it does
1. Scans all `.md` files in the repository
2. Extracts YAML frontmatter metadata (title, description, category, tags)
3. Falls back to extracting title from first heading and description from first paragraph
4. Derives category from folder structure if not specified
5. Generates `.docs-index.json` with all document metadata
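
Steps 3 and 4 above (the fallback chain when frontmatter is missing fields) can be sketched as follows. This is a standalone illustration, not the script itself; `fallback_metadata` is a hypothetical helper name, and the real script spreads this logic across several functions:

```python
import re
from pathlib import Path

def fallback_metadata(path: str, content: str) -> dict:
    """Sketch: title from the first ATX heading (else the filename),
    category from the top-level folder (else "root")."""
    # Title: first "# Heading" line, else filename with separators spaced out
    m = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
    title = m.group(1).strip() if m else Path(path).stem.replace('_', ' ')
    # Category: top-level folder, lowercased and hyphenated
    parts = Path(path).parts
    if len(parts) == 1:
        category = "root"
    else:
        category = parts[0].lower().replace('_', '-').replace(' ', '-')
    return {"title": title, "category": category}

print(fallback_metadata("Services/Device_Management.md",
                        "# Device Management\n\nCore services."))
# -> {'title': 'Device Management', 'category': 'services'}
```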
### Usage
```bash
# Run manually
python tools/generate_index.py
# Runs automatically via Forgejo Actions when .md files are committed to main
```
### Automation
The `.forgejo/workflows/regenerate-index.yml` workflow automatically:
- Triggers on push to main branch when `.md` files change
- Runs the index generator
- Commits and pushes `.docs-index.json` if it changed
### Metadata Format
Documents should include YAML frontmatter:
```yaml
---
title: Document Title
description: Brief summary for search results
category: services|legal|backup|security|getting-started|products|policies|about
tags: [tag1, tag2, tag3]
---
```
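
The generator reads this frontmatter with a simple line-based split rather than a full YAML library, so only flat `key: value` pairs and bracketed lists are supported. A standalone sketch of what the example above parses to (the variable names here are illustrative):

```python
import re

doc = """---
title: Document Title
description: Brief summary for search results
category: services
tags: [tag1, tag2, tag3]
---
# Body starts here
"""

# Same approach as the script: regex off the --- block,
# then split each line on the first ':'
block = re.match(r'^---\s*\n(.*?)\n---\s*\n', doc, re.DOTALL).group(1)
meta = {}
for line in block.split('\n'):
    if ':' not in line:
        continue
    key, value = (s.strip() for s in line.split(':', 1))
    if value.startswith('[') and value.endswith(']'):
        # Bracketed lists become Python lists
        value = [v.strip() for v in value[1:-1].split(',') if v.strip()]
    meta[key] = value

print(meta['tags'])  # -> ['tag1', 'tag2', 'tag3']
```

Nested mappings or multi-line YAML values will not round-trip through this parser; keep frontmatter flat.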
### Index Structure
```json
{
"version": "1.0",
"repo": "digiDocs",
"generated": "2026-01-28T12:00:00Z",
"doc_count": 73,
"docs": [
{
"path": "Services/Device_Management.md",
"title": "Device Management",
"description": "Core device management services and tiers",
"category": "services",
"tags": ["device", "services"],
"audience": "customer"
}
]
}
```
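
A consumer such as joeyking might filter this structure by tag. The snippet below is illustrative only (the data is made up and `docs_with_tag` is a hypothetical helper; the actual search side is not part of this commit):

```python
import json

# Toy index in the structure shown above (not real repo data)
index = json.loads("""{
  "version": "1.0",
  "repo": "digiDocs",
  "doc_count": 2,
  "docs": [
    {"path": "Services/Device_Management.md", "title": "Device Management",
     "category": "services", "tags": ["device", "services"]},
    {"path": "Legal/Terms.md", "title": "Terms",
     "category": "legal", "tags": ["legal"]}
  ]
}""")

def docs_with_tag(index: dict, tag: str) -> list:
    """Hypothetical helper: titles of docs carrying a given tag."""
    return [d["title"] for d in index["docs"] if tag in d.get("tags", [])]

print(docs_with_tag(index, "device"))  # -> ['Device Management']
```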
### Requirements
- Python 3.7+
- No external dependencies (uses only standard library)

tools/generate_index.py Executable file

@@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""
Generate .docs-index.json for digiDocs repository.

This script scans all markdown files, extracts YAML frontmatter metadata,
and creates a machine-readable JSON index for search and discovery.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional


def extract_yaml_frontmatter(content: str) -> Optional[Dict]:
    """
    Extract YAML frontmatter from markdown content.

    Args:
        content: Markdown file content

    Returns:
        Dict with metadata or None if no frontmatter found
    """
    pattern = r'^---\s*\n(.*?)\n---\s*\n'
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None

    frontmatter = {}
    yaml_content = match.group(1)
    for line in yaml_content.split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()
            # Handle list values in square brackets
            if value.startswith('[') and value.endswith(']'):
                value = [v.strip() for v in value[1:-1].split(',') if v.strip()]
            # Handle quoted strings
            elif value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            elif value.startswith("'") and value.endswith("'"):
                value = value[1:-1]
            frontmatter[key] = value
    return frontmatter


def extract_description_fallback(content: str, title: str) -> str:
    """
    Extract description from first paragraph if not in frontmatter.

    Args:
        content: Markdown file content
        title: Document title

    Returns:
        Description string (max 200 chars)
    """
    # Remove frontmatter
    content = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)
    # Remove headings, the title line, and bold metadata lines
    lines = content.split('\n')
    filtered_lines = []
    for line in lines:
        stripped = line.strip()
        if stripped and not stripped.startswith('#'):
            if stripped != title and not stripped.startswith('**'):
                filtered_lines.append(stripped)
    if filtered_lines:
        description = ' '.join(filtered_lines[:3])
        if len(description) > 200:
            description = description[:197] + '...'
        return description
    return ""


def derive_category_from_path(relative_path: str) -> str:
    """
    Derive category from file path.

    Args:
        relative_path: Relative path to markdown file

    Returns:
        Category string (lowercase, hyphenated)
    """
    parts = Path(relative_path).parts
    if len(parts) == 1:
        return "root"
    folder = parts[0]
    return folder.lower().replace(' ', '-').replace('_', '-')


def scan_markdown_files(repo_path: Path) -> List[Dict]:
    """
    Scan all markdown files in repository and extract metadata.

    Args:
        repo_path: Path to repository root

    Returns:
        List of document metadata dicts
    """
    docs = []
    # Paths to exclude from the scan
    exclude_patterns = [
        '.git',
        'node_modules',
        '.forgejo',
        '.github',
    ]
    for md_file in repo_path.rglob('*.md'):
        # Skip excluded paths
        if any(pattern in str(md_file) for pattern in exclude_patterns):
            continue
        relative_path = md_file.relative_to(repo_path)
        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Extract frontmatter; skip files without it
            frontmatter = extract_yaml_frontmatter(content)
            if not frontmatter:
                continue

            # Extract title (required)
            title = frontmatter.get('title', '')
            if not title:
                # Try to extract from first heading
                heading_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
                if heading_match:
                    title = heading_match.group(1).strip()
                else:
                    title = md_file.stem.replace('_', ' ').replace('-', ' ')

            # Extract description
            description = frontmatter.get('description', '')
            if not description:
                description = extract_description_fallback(content, title)

            # Extract category
            category = frontmatter.get('category', '')
            if not category:
                category = derive_category_from_path(str(relative_path))

            # Extract tags (normalize a bare string to a one-element list)
            tags = frontmatter.get('tags', [])
            if isinstance(tags, str):
                tags = [tags]

            doc_entry = {
                "path": str(relative_path),
                "title": title,
                "description": description,
                "category": category,
                "tags": tags,
                "audience": "customer"
            }
            docs.append(doc_entry)
        except Exception as e:
            print(f"Warning: Failed to process {relative_path}: {e}")
            continue

    # Sort by path for a stable, diff-friendly index
    docs.sort(key=lambda x: x['path'])
    return docs


def generate_index(repo_path: Path, repo_name: str) -> Dict:
    """
    Generate complete index structure.

    Args:
        repo_path: Path to repository root
        repo_name: Name of repository

    Returns:
        Complete index dict
    """
    docs = scan_markdown_files(repo_path)
    index = {
        "version": "1.0",
        "repo": repo_name,
        "generated": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        "doc_count": len(docs),
        "docs": docs
    }
    return index


def main():
    """Main execution function."""
    # Determine repo root (parent of tools/)
    script_dir = Path(__file__).parent
    repo_path = script_dir.parent
    repo_name = repo_path.name

    print(f"Generating index for {repo_name}...")
    print(f"Repository path: {repo_path}")

    # Generate index
    index = generate_index(repo_path, repo_name)

    # Write to .docs-index.json
    index_path = repo_path / '.docs-index.json'
    with open(index_path, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2, ensure_ascii=False)

    print(f"Generated index with {index['doc_count']} documents")
    print(f"Written to: {index_path}")


if __name__ == '__main__':
    main()