digiDocs/tools/generate_index.py
dbits-db 54de1e33d8
Some checks failed
Regenerate Documentation Index / regenerate-index (push) Failing after 7s
Enhance metadata system with improved extraction and automation
- Improved tag extraction with 60+ digiBandit-specific keywords
- Better description extraction (skip metadata, clean markdown)
- Added related docs detection based on folder/tags
- Added Forgejo Action workflow for auto-regeneration
- Added tools/generate_index.py for standalone index generation
2026-01-28 19:33:50 -04:00

242 lines
6.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Generate .docs-index.json for digiDocs repository.
This script scans all markdown files, extracts YAML frontmatter metadata,
and creates a machine-readable JSON index for search and discovery.
"""
import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
def extract_yaml_frontmatter(content: str) -> Optional[Dict]:
    """
    Extract YAML frontmatter from markdown content.

    Only a flat subset of YAML is supported: `key: value` pairs, inline
    lists in square brackets, and single- or double-quoted scalar values.
    Nested mappings and block-style lists are not parsed.

    Args:
        content: Markdown file content

    Returns:
        Dict with metadata or None if no frontmatter found
    """
    pattern = r'^---\s*\n(.*?)\n---\s*\n'
    match = re.match(pattern, content, re.DOTALL)
    if not match:
        return None

    def _unquote(value: str) -> str:
        # Strip one matching pair of surrounding quotes, if present.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            return value[1:-1]
        return value

    frontmatter = {}
    yaml_content = match.group(1)
    for line in yaml_content.split('\n'):
        line = line.strip()
        # Skip blank lines and YAML comments.
        if not line or line.startswith('#'):
            continue
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        key = key.strip()
        value = value.strip()
        if value.startswith('[') and value.endswith(']'):
            # Inline list: split on commas and unquote each item, so
            # `tags: ["a", 'b']` yields ['a', 'b'] rather than items
            # that still carry their quote characters.
            value = [_unquote(v.strip()) for v in value[1:-1].split(',') if v.strip()]
        else:
            value = _unquote(value)
        frontmatter[key] = value
    return frontmatter
def extract_description_fallback(content: str, title: str) -> str:
    """
    Extract description from first paragraph if not in frontmatter.

    Args:
        content: Markdown file content
        title: Document title

    Returns:
        Description string (max 200 chars)
    """
    # Drop the frontmatter block before looking at the body text.
    body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)

    # Collect plain prose lines: skip headings, bold-only metadata lines,
    # blanks, and a bare repetition of the title.
    candidates = []
    for raw in body.split('\n'):
        text = raw.strip()
        if not text or text.startswith('#'):
            continue
        if text == title or text.startswith('**'):
            continue
        candidates.append(text)

    if not candidates:
        return ""

    # Join the first few prose lines and truncate to the 200-char cap.
    summary = ' '.join(candidates[:3])
    return summary if len(summary) <= 200 else summary[:197] + '...'
def derive_category_from_path(relative_path: str) -> str:
    """
    Derive category from file path.

    Args:
        relative_path: Relative path to markdown file

    Returns:
        Category string (lowercase, hyphenated)
    """
    parts = Path(relative_path).parts
    # Files sitting directly at the repo root get a fixed category.
    if len(parts) == 1:
        return "root"
    # Otherwise use the top-level folder name, normalized to
    # lowercase with hyphens instead of spaces/underscores.
    top_folder = parts[0].lower()
    return top_folder.replace(' ', '-').replace('_', '-')
def scan_markdown_files(repo_path: Path) -> List[Dict]:
    """
    Scan all markdown files in repository and extract metadata.

    Files without YAML frontmatter are skipped entirely; files that fail
    to read or parse are reported as warnings and skipped.

    Args:
        repo_path: Path to repository root

    Returns:
        List of document metadata dicts, sorted by relative path
    """
    docs = []
    # Directory names to exclude anywhere along the path. Matching whole
    # path components (rather than substrings of the full path) avoids
    # false positives such as a file named "notes.github.md".
    exclude_dirs = {
        '.git',
        'node_modules',
        '.forgejo',
        '.github',
    }
    for md_file in repo_path.rglob('*.md'):
        relative_path = md_file.relative_to(repo_path)
        # Skip anything under an excluded directory.
        if exclude_dirs.intersection(relative_path.parts):
            continue
        try:
            content = md_file.read_text(encoding='utf-8')
            # Extract frontmatter; files without it are not indexed.
            frontmatter = extract_yaml_frontmatter(content)
            if not frontmatter:
                continue
            # Title: frontmatter first, then first heading, then filename.
            title = frontmatter.get('title', '')
            if not title:
                heading_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
                if heading_match:
                    title = heading_match.group(1).strip()
                else:
                    title = md_file.stem.replace('_', ' ').replace('-', ' ')
            # Description: frontmatter first, then first body paragraph.
            description = frontmatter.get('description', '')
            if not description:
                description = extract_description_fallback(content, title)
            # Category: frontmatter first, then derived from folder name.
            category = frontmatter.get('category', '')
            if not category:
                category = derive_category_from_path(str(relative_path))
            # Tags: normalize a single scalar tag into a one-element list.
            tags = frontmatter.get('tags', [])
            if isinstance(tags, str):
                tags = [tags]
            docs.append({
                "path": str(relative_path),
                "title": title,
                "description": description,
                "category": category,
                "tags": tags,
                "audience": "customer"
            })
        except Exception as e:
            # Best-effort scan: report and move on rather than aborting
            # the whole index over one unreadable file.
            print(f"Warning: Failed to process {relative_path}: {e}")
            continue
    # Deterministic output ordering for stable diffs of the index file.
    docs.sort(key=lambda x: x['path'])
    return docs
def generate_index(repo_path: Path, repo_name: str) -> Dict:
    """
    Generate complete index structure.

    Args:
        repo_path: Path to repository root
        repo_name: Name of repository

    Returns:
        Complete index dict
    """
    documents = scan_markdown_files(repo_path)
    # UTC timestamp in ISO-8601 form with a trailing "Z" instead of "+00:00".
    generated_at = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
    return {
        "version": "1.0",
        "repo": repo_name,
        "generated": generated_at,
        "doc_count": len(documents),
        "docs": documents,
    }
def main():
    """Main execution function."""
    # The repository root is the parent of the tools/ directory
    # this script lives in.
    script_dir = Path(__file__).parent
    repo_path = script_dir.parent
    repo_name = repo_path.name
    print(f"Generating index for {repo_name}...")
    print(f"Repository path: {repo_path}")
    # Build the index and write it to .docs-index.json at the repo root.
    index = generate_index(repo_path, repo_name)
    index_path = repo_path / '.docs-index.json'
    index_path.write_text(
        json.dumps(index, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )
    print(f"Generated index with {index['doc_count']} documents")
    print(f"Written to: {index_path}")


if __name__ == '__main__':
    main()