Some checks failed
Regenerate Documentation Index / regenerate-index (push) Failing after 7s
- Improved tag extraction with 60+ digiBandit-specific keywords - Better description extraction (skip metadata, clean markdown) - Added related docs detection based on folder/tags - Added Forgejo Action workflow for auto-regeneration - Added tools/generate_index.py for standalone index generation
242 lines
6.4 KiB
Python
Executable file
242 lines
6.4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Generate .docs-index.json for digiDocs repository.
|
|
|
|
This script scans all markdown files, extracts YAML frontmatter metadata,
|
|
and creates a machine-readable JSON index for search and discovery.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
|
|
|
|
def _strip_quotes(value: str) -> str:
    """Remove one pair of matching surrounding quotes (single or double), if present."""
    if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
        return value[1:-1]
    return value


def extract_yaml_frontmatter(content: str) -> Optional[Dict]:
    """
    Extract YAML frontmatter from markdown content.

    Only a small, flat subset of YAML is supported: ``key: value`` pairs,
    inline lists (``[a, b]``), and single- or double-quoted scalars.
    Nested mappings and multi-line values are not parsed.

    Args:
        content: Markdown file content

    Returns:
        Dict with metadata or None if no frontmatter found
    """
    # Frontmatter must be the very first thing in the file: a '---' fence,
    # arbitrary lines, then a closing '---' fence.
    pattern = r'^---\s*\n(.*?)\n---\s*\n'
    match = re.match(pattern, content, re.DOTALL)

    if not match:
        return None

    frontmatter = {}
    yaml_content = match.group(1)

    for line in yaml_content.split('\n'):
        line = line.strip()
        # Skip blanks and YAML comments.
        if not line or line.startswith('#'):
            continue

        if ':' in line:
            # Split on the first ':' only, so values may contain colons.
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()

            # Handle list values in square brackets.
            if value.startswith('[') and value.endswith(']'):
                # BUGFIX: strip surrounding quotes from each item so that
                # tags: ["a", 'b'] yields ['a', 'b'], not ['"a"', "'b'"].
                value = [
                    _strip_quotes(item.strip())
                    for item in value[1:-1].split(',')
                    if item.strip()
                ]
            else:
                # Handle quoted scalar strings.
                value = _strip_quotes(value)

            frontmatter[key] = value

    return frontmatter
|
|
|
|
|
|
def extract_description_fallback(content: str, title: str) -> str:
    """
    Derive a short description from the document body when the
    frontmatter does not provide one.

    Args:
        content: Markdown file content
        title: Document title

    Returns:
        Description string (max 200 chars), or "" if no usable text found
    """
    # Drop the frontmatter block so it cannot leak into the description.
    body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)

    # Keep plain text lines only: skip blanks, headings, the title itself,
    # and bold "metadata"-style lines (e.g. **Status:** ...).
    candidates = [
        text
        for text in (raw.strip() for raw in body.split('\n'))
        if text
        and not text.startswith('#')
        and text != title
        and not text.startswith('**')
    ]

    if not candidates:
        return ""

    # First three usable lines, truncated to a 200-character budget.
    summary = ' '.join(candidates[:3])
    return summary[:197] + '...' if len(summary) > 200 else summary
|
|
|
|
|
|
def derive_category_from_path(relative_path: str) -> str:
    """
    Derive a category name from a document's location in the repository.

    Args:
        relative_path: Relative path to markdown file

    Returns:
        Category string (lowercase, hyphenated); "root" for top-level files
    """
    components = Path(relative_path).parts

    # Files that live directly in the repository root get a fixed category.
    if len(components) == 1:
        return "root"

    # The top-level folder names the category, normalized to
    # lowercase with spaces/underscores turned into hyphens.
    return re.sub(r'[ _]', '-', components[0].lower())
|
|
|
|
|
|
def scan_markdown_files(repo_path: Path) -> List[Dict]:
    """
    Scan all markdown files in repository and extract metadata.

    Files without YAML frontmatter are skipped. Files that cannot be read
    or parsed emit a warning and are skipped (best-effort scan).

    Args:
        repo_path: Path to repository root

    Returns:
        List of document metadata dicts, sorted by path
    """
    docs = []

    # Directory names to exclude. BUGFIX: these are matched against path
    # components of the path *relative to the repo*, not as substrings of
    # the absolute path. The old substring check excluded every file when
    # the checkout lived under e.g. ~/github-projects/, and excluded any
    # file whose name merely contained ".git".
    exclude_dirs = {
        '.git',
        'node_modules',
        '.forgejo',
        '.github',
    }

    for md_file in repo_path.rglob('*.md'):
        relative_path = md_file.relative_to(repo_path)

        # Skip files inside excluded directories.
        if exclude_dirs.intersection(relative_path.parts):
            continue

        try:
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Only frontmatter-bearing documents are indexed.
            frontmatter = extract_yaml_frontmatter(content)
            if not frontmatter:
                continue

            # Title: frontmatter, then first '# ' heading, then filename.
            title = frontmatter.get('title', '')
            if not title:
                heading_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
                if heading_match:
                    title = heading_match.group(1).strip()
                else:
                    title = md_file.stem.replace('_', ' ').replace('-', ' ')

            # Description: frontmatter, else first body paragraph.
            description = frontmatter.get('description', '')
            if not description:
                description = extract_description_fallback(content, title)

            # Category: frontmatter, else derived from the top-level folder.
            category = frontmatter.get('category', '')
            if not category:
                category = derive_category_from_path(str(relative_path))

            # Tags: normalize a bare scalar to a one-element list.
            tags = frontmatter.get('tags', [])
            if isinstance(tags, str):
                tags = [tags]

            docs.append({
                "path": str(relative_path),
                "title": title,
                "description": description,
                "category": category,
                "tags": tags,
                # Audience is currently fixed for all indexed docs.
                "audience": "customer",
            })

        except Exception as e:
            # Best-effort: report the failure and keep scanning.
            print(f"Warning: Failed to process {relative_path}: {e}")
            continue

    # Deterministic order gives stable diffs of the generated index.
    docs.sort(key=lambda x: x['path'])

    return docs
|
|
|
|
|
|
def generate_index(repo_path: Path, repo_name: str) -> Dict:
    """
    Build the complete .docs-index.json structure.

    Args:
        repo_path: Path to repository root
        repo_name: Name of repository

    Returns:
        Complete index dict (version, repo, timestamp, doc count, docs)
    """
    documents = scan_markdown_files(repo_path)

    # ISO-8601 UTC timestamp, with the '+00:00' offset rendered as 'Z'.
    timestamp = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    return {
        "version": "1.0",
        "repo": repo_name,
        "generated": timestamp,
        "doc_count": len(documents),
        "docs": documents,
    }
|
|
|
|
|
|
def main():
    """Generate the documentation index and write it to <repo_root>/.docs-index.json."""
    # The repo root is one level above tools/, where this script lives.
    root = Path(__file__).parent.parent
    name = root.name

    print(f"Generating index for {name}...")
    print(f"Repository path: {root}")

    index = generate_index(root, name)

    output = root / '.docs-index.json'
    # ensure_ascii=False keeps non-ASCII titles human-readable in the JSON.
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2, ensure_ascii=False)

    print(f"Generated index with {index['doc_count']} documents")
    print(f"Written to: {output}")
|
|
|
|
|
|
# Allow running directly as a script: python tools/generate_index.py
if __name__ == '__main__':
    main()
|