#!/usr/bin/env python3
"""
Generate .docs-index.json for digiDocs repository.

This script scans all markdown files, extracts YAML frontmatter metadata,
and creates a machine-readable JSON index for search and discovery.
"""

import json
import os
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

# Single compiled pattern for a leading YAML frontmatter block, shared by the
# parser and the stripper so the two can never drift apart.
# FIX: the closing '---' may be followed by a newline OR end-of-file; the
# original pattern required a trailing newline, so a file ending exactly at
# the closing fence was silently treated as having no frontmatter.
_FRONTMATTER_RE = re.compile(r'^---\s*\n(.*?)\n---\s*(?:\n|$)', re.DOTALL)


def extract_yaml_frontmatter(content: str) -> Optional[Dict]:
    """
    Extract YAML frontmatter from markdown content.

    This is a deliberately minimal parser: it handles flat ``key: value``
    pairs, inline lists (``[a, b]``) and single/double-quoted scalars.
    Nested mappings and block-style lists are not supported.

    Args:
        content: Markdown file content

    Returns:
        Dict with metadata or None if no frontmatter found
    """
    match = _FRONTMATTER_RE.match(content)
    if not match:
        return None

    frontmatter: Dict = {}
    for line in match.group(1).split('\n'):
        line = line.strip()
        # Skip blanks and YAML comments.
        if not line or line.startswith('#'):
            continue
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        key = key.strip()
        value = value.strip()

        # Handle inline list values in square brackets: [a, b, c]
        if value.startswith('[') and value.endswith(']'):
            value = [v.strip() for v in value[1:-1].split(',') if v.strip()]
        # Handle quoted scalar strings.
        elif value.startswith('"') and value.endswith('"'):
            value = value[1:-1]
        elif value.startswith("'") and value.endswith("'"):
            value = value[1:-1]

        frontmatter[key] = value

    return frontmatter


def extract_description_fallback(content: str, title: str) -> str:
    """
    Extract description from first paragraph if not in frontmatter.

    Args:
        content: Markdown file content
        title: Document title

    Returns:
        Description string (max 200 chars), or "" if no suitable text found.
    """
    # Remove frontmatter (same pattern used by extract_yaml_frontmatter).
    content = _FRONTMATTER_RE.sub('', content, count=1)

    # Keep the first non-heading, non-bold-lead lines that are not the title.
    filtered_lines = []
    for line in content.split('\n'):
        stripped = line.strip()
        if stripped and not stripped.startswith('#'):
            if stripped != title and not stripped.startswith('**'):
                filtered_lines.append(stripped)

    if filtered_lines:
        description = ' '.join(filtered_lines[:3])
        if len(description) > 200:
            # Truncate to exactly 200 chars including the ellipsis.
            description = description[:197] + '...'
        return description

    return ""


def derive_category_from_path(relative_path: str) -> str:
    """
    Derive category from file path.

    Top-level files map to "root"; otherwise the first folder name is
    normalized to lowercase with spaces/underscores turned into hyphens.

    Args:
        relative_path: Relative path to markdown file

    Returns:
        Category string (lowercase, hyphenated)
    """
    parts = Path(relative_path).parts
    if len(parts) == 1:
        return "root"

    folder = parts[0]
    return folder.lower().replace(' ', '-').replace('_', '-')


def scan_markdown_files(repo_path: Path) -> List[Dict]:
    """
    Scan all markdown files in repository and extract metadata.

    Files without YAML frontmatter are skipped; files that fail to read or
    parse produce a warning and are skipped (best-effort scan).

    Args:
        repo_path: Path to repository root

    Returns:
        List of document metadata dicts, sorted by relative path
    """
    docs = []

    # Path components whose subtrees are excluded from the scan.
    # FIX: compare against exact path components instead of a raw substring
    # test on the full path, so a file merely *containing* one of these
    # tokens in its name (e.g. "release.github.md") is not wrongly skipped.
    exclude_parts = {
        '.git',
        'node_modules',
        '.forgejo',
        '.github',
    }

    for md_file in repo_path.rglob('*.md'):
        # Skip anything under an excluded directory.
        if any(part in exclude_parts for part in md_file.parts):
            continue

        relative_path = md_file.relative_to(repo_path)

        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Only index files that declare frontmatter.
            frontmatter = extract_yaml_frontmatter(content)
            if not frontmatter:
                continue

            # Title: frontmatter, else first '#' heading, else filename stem.
            title = frontmatter.get('title', '')
            if not title:
                heading_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
                if heading_match:
                    title = heading_match.group(1).strip()
                else:
                    title = md_file.stem.replace('_', ' ').replace('-', ' ')

            # Description: frontmatter, else first paragraph of the body.
            description = frontmatter.get('description', '')
            if not description:
                description = extract_description_fallback(content, title)

            # Category: frontmatter, else derived from the folder structure.
            category = frontmatter.get('category', '')
            if not category:
                category = derive_category_from_path(str(relative_path))

            # Tags: normalize a bare scalar to a one-element list.
            tags = frontmatter.get('tags', [])
            if isinstance(tags, str):
                tags = [tags]

            docs.append({
                "path": str(relative_path),
                "title": title,
                "description": description,
                "category": category,
                "tags": tags,
                "audience": "customer",
            })

        except Exception as e:
            # Best-effort: report and keep scanning the remaining files.
            print(f"Warning: Failed to process {relative_path}: {e}")
            continue

    # Deterministic output order for stable diffs of the generated index.
    docs.sort(key=lambda x: x['path'])

    return docs


def generate_index(repo_path: Path, repo_name: str) -> Dict:
    """
    Generate complete index structure.

    Args:
        repo_path: Path to repository root
        repo_name: Name of repository

    Returns:
        Complete index dict with version, repo name, UTC timestamp
        (ISO 8601 with 'Z' suffix), doc count, and document entries.
    """
    docs = scan_markdown_files(repo_path)

    return {
        "version": "1.0",
        "repo": repo_name,
        "generated": datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        "doc_count": len(docs),
        "docs": docs,
    }


def main():
    """Generate the index for the repo containing this script and write it out."""
    # Determine repo root (parent of tools/).
    script_dir = Path(__file__).parent
    repo_path = script_dir.parent
    repo_name = repo_path.name

    print(f"Generating index for {repo_name}...")
    print(f"Repository path: {repo_path}")

    index = generate_index(repo_path, repo_name)

    # Write to .docs-index.json at the repo root.
    index_path = repo_path / '.docs-index.json'
    with open(index_path, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2, ensure_ascii=False)

    print(f"Generated index with {index['doc_count']} documents")
    print(f"Written to: {index_path}")


if __name__ == '__main__':
    main()