Enhance metadata system with improved extraction and automation
Some checks failed
Regenerate Documentation Index / regenerate-index (push) Failing after 7s
Some checks failed
Regenerate Documentation Index / regenerate-index (push) Failing after 7s
- Improved tag extraction with 60+ digiBandit-specific keywords - Better description extraction (skip metadata, clean markdown) - Added related docs detection based on folder/tags - Added Forgejo Action workflow for auto-regeneration - Added tools/generate_index.py for standalone index generation
This commit is contained in:
parent
a9fc9bc400
commit
54de1e33d8
4 changed files with 1130 additions and 98 deletions
851
.docs-index.json
851
.docs-index.json
File diff suppressed because it is too large
Load diff
61
.forgejo/workflows/regenerate-index.yml
Normal file
61
.forgejo/workflows/regenerate-index.yml
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
# Regenerates .docs-index.json whenever markdown docs (or the generator
# script) change on main. The auto-generated commit only touches
# .docs-index.json, which is not matched by the paths filter below, so it
# does not re-trigger this workflow.
name: Regenerate Documentation Index

on:
  push:
    branches:
      - main
    paths:
      - '**.md'
      - 'tools/generate_index.py'

jobs:
  regenerate-index:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Need the previous commit so HEAD^ is available for the diff below
          fetch-depth: 2

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Check if markdown files changed
        id: check_changes
        run: |
          # Get list of changed files. Fall back to listing HEAD's own files
          # when HEAD^ does not exist (first commit on the branch / shallow
          # history), where a bare `git diff HEAD^ HEAD` would fail the step.
          CHANGED_FILES=$(git diff --name-only HEAD^ HEAD 2>/dev/null || git show --name-only --format= HEAD)

          # Check if any .md files changed (excluding .docs-index.json)
          if echo "$CHANGED_FILES" | grep -q '\.md$'; then
            echo "md_changed=true" >> "$GITHUB_OUTPUT"
          else
            echo "md_changed=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Generate documentation index
        if: steps.check_changes.outputs.md_changed == 'true'
        run: |
          python tools/generate_index.py

      - name: Check if index changed
        if: steps.check_changes.outputs.md_changed == 'true'
        id: check_index
        run: |
          if git diff --quiet .docs-index.json; then
            echo "index_changed=false" >> "$GITHUB_OUTPUT"
          else
            echo "index_changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Commit and push index
        if: steps.check_index.outputs.index_changed == 'true'
        run: |
          git config user.name "digiBandit"
          git config user.email "joey.king@dbits.ca"
          git add .docs-index.json
          git commit -m "Update documentation index (auto-generated)"
          git push
|
||||
74
tools/README.md
Normal file
74
tools/README.md
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
# digiDocs Tools
|
||||
|
||||
Utility scripts for maintaining the digiDocs repository.
|
||||
|
||||
## generate_index.py
|
||||
|
||||
Generates `.docs-index.json` - a machine-readable index of all documentation.
|
||||
|
||||
### Purpose
|
||||
|
||||
Creates a searchable index for joeyking (jk.dbits.ca) and other tools to discover and search documentation without reading every file.
|
||||
|
||||
### What it does
|
||||
|
||||
1. Scans all `.md` files in the repository
|
||||
2. Extracts YAML frontmatter metadata (title, description, category, tags)
|
||||
3. For files whose frontmatter is missing fields, falls back to extracting the title from the first heading and the description from the first paragraph (files with no frontmatter at all are skipped and not indexed)
|
||||
4. Derives category from folder structure if not specified
|
||||
5. Generates `.docs-index.json` with all document metadata
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
# Run manually
|
||||
python tools/generate_index.py
|
||||
|
||||
# Runs automatically via Forgejo Actions when .md files are committed to main
|
||||
```
|
||||
|
||||
### Automation
|
||||
|
||||
The `.forgejo/workflows/regenerate-index.yml` workflow automatically:
|
||||
- Triggers on push to main branch when `.md` files change
|
||||
- Runs the index generator
|
||||
- Commits and pushes `.docs-index.json` if it changed
|
||||
|
||||
### Metadata Format
|
||||
|
||||
Documents should include YAML frontmatter:
|
||||
|
||||
```yaml
|
||||
---
|
||||
title: Document Title
|
||||
description: Brief summary for search results
|
||||
category: services|legal|backup|security|getting-started|products|policies|about
|
||||
tags: [tag1, tag2, tag3]
|
||||
---
|
||||
```
|
||||
|
||||
### Index Structure
|
||||
|
||||
```json
|
||||
{
|
||||
"version": "1.0",
|
||||
"repo": "digiDocs",
|
||||
"generated": "2026-01-28T12:00:00Z",
|
||||
"doc_count": 73,
|
||||
"docs": [
|
||||
{
|
||||
"path": "Services/Device_Management.md",
|
||||
"title": "Device Management",
|
||||
"description": "Core device management services and tiers",
|
||||
"category": "services",
|
||||
"tags": ["device", "services"],
|
||||
"audience": "customer"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Requirements
|
||||
|
||||
- Python 3.7+
|
||||
- No external dependencies (uses only standard library)
|
||||
242
tools/generate_index.py
Executable file
242
tools/generate_index.py
Executable file
|
|
@ -0,0 +1,242 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate .docs-index.json for digiDocs repository.
|
||||
|
||||
This script scans all markdown files, extracts YAML frontmatter metadata,
|
||||
and creates a machine-readable JSON index for search and discovery.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
def extract_yaml_frontmatter(content: str) -> Optional[Dict]:
    """
    Extract YAML frontmatter from markdown content.

    This is a minimal, dependency-free parser: it only understands
    single-line ``key: value`` pairs and inline lists in square brackets.
    Multi-line YAML block lists/values are not supported.

    Args:
        content: Markdown file content

    Returns:
        Dict with metadata or None if no frontmatter found
    """
    # Frontmatter must be the very first thing in the file: ---, body, ---
    pattern = r'^---\s*\n(.*?)\n---\s*\n'
    match = re.match(pattern, content, re.DOTALL)

    if not match:
        return None

    frontmatter = {}
    yaml_content = match.group(1)

    for line in yaml_content.split('\n'):
        line = line.strip()
        # Skip blank lines and YAML comments
        if not line or line.startswith('#'):
            continue

        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()

            # Handle list values in square brackets. Strip surrounding
            # quotes from each element so tags: ["a", 'b'] yields
            # ["a", "b"] rather than ['"a"', "'b'"]; unquoted elements
            # are unaffected.
            if value.startswith('[') and value.endswith(']'):
                value = [
                    v.strip().strip('"\'')
                    for v in value[1:-1].split(',')
                    if v.strip()
                ]
            # Handle quoted strings
            elif value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            elif value.startswith("'") and value.endswith("'"):
                value = value[1:-1]

            frontmatter[key] = value

    return frontmatter
|
||||
|
||||
|
||||
def extract_description_fallback(content: str, title: str) -> str:
    """
    Build a description from the document body when none is in frontmatter.

    Joins the first three content lines — skipping frontmatter, headings,
    a line that repeats the title, and bold-prefixed (``**``) metadata
    lines — and truncates the result to at most 200 characters.

    Args:
        content: Markdown file content
        title: Document title

    Returns:
        Description string (max 200 chars), or "" if no usable line exists
    """
    # Drop a leading frontmatter block, if present
    body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)

    candidates = []
    for raw_line in body.split('\n'):
        text = raw_line.strip()
        # Skip blanks, headings, the title itself, and bold-metadata lines
        if not text or text.startswith('#'):
            continue
        if text == title or text.startswith('**'):
            continue
        candidates.append(text)
        if len(candidates) == 3:
            break

    if not candidates:
        return ""

    summary = ' '.join(candidates)
    # Keep search snippets short: cap at 200 chars including the ellipsis
    return summary if len(summary) <= 200 else summary[:197] + '...'
|
||||
|
||||
|
||||
def derive_category_from_path(relative_path: str) -> str:
    """
    Derive a category slug from a file's top-level folder.

    Args:
        relative_path: Relative path to markdown file

    Returns:
        Category string (lowercase, hyphenated); "root" for files that
        live directly in the repository root
    """
    segments = Path(relative_path).parts

    # A single segment means the file sits at the repo root
    if len(segments) == 1:
        return "root"

    # Slugify the top-level folder name: lowercase, spaces/underscores -> hyphens
    slug = segments[0].lower()
    for ch in (' ', '_'):
        slug = slug.replace(ch, '-')
    return slug
|
||||
|
||||
|
||||
def scan_markdown_files(repo_path: Path) -> List[Dict]:
    """
    Scan all markdown files in the repository and extract metadata.

    Only files that carry YAML frontmatter are indexed; files without it
    are skipped entirely. Individual file failures are reported as
    warnings and do not abort the scan.

    Args:
        repo_path: Path to repository root

    Returns:
        List of document metadata dicts, sorted by relative path
    """
    docs = []

    # Directory names to exclude. Matched against the *relative* path
    # components rather than the raw absolute-path string, so the scan is
    # not broken by a checkout whose parent directory happens to contain
    # one of these substrings, and file names merely containing e.g.
    # ".git" are not wrongly skipped.
    exclude_dirs = {
        '.git',
        'node_modules',
        '.forgejo',
        '.github',
    }

    for md_file in repo_path.rglob('*.md'):
        relative_path = md_file.relative_to(repo_path)

        # Skip files inside excluded directories
        if any(part in exclude_dirs for part in relative_path.parts[:-1]):
            continue

        try:
            content = md_file.read_text(encoding='utf-8')

            # Extract frontmatter
            frontmatter = extract_yaml_frontmatter(content)

            if not frontmatter:
                # Skip files without frontmatter (intentionally unindexed)
                continue

            # Extract title (required)
            title = frontmatter.get('title', '')
            if not title:
                # Try to extract from first heading
                heading_match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
                if heading_match:
                    title = heading_match.group(1).strip()
                else:
                    # Last resort: humanize the file name
                    title = md_file.stem.replace('_', ' ').replace('-', ' ')

            # Extract description, falling back to the first paragraph
            description = frontmatter.get('description', '')
            if not description:
                description = extract_description_fallback(content, title)

            # Extract category, falling back to the top-level folder name
            category = frontmatter.get('category', '')
            if not category:
                category = derive_category_from_path(str(relative_path))

            # Extract tags; normalize a lone string to a one-element list
            tags = frontmatter.get('tags', [])
            if isinstance(tags, str):
                tags = [tags]

            doc_entry = {
                "path": str(relative_path),
                "title": title,
                "description": description,
                "category": category,
                "tags": tags,
                # All indexed docs are customer-facing by convention here;
                # no per-doc override is read from frontmatter.
                "audience": "customer"
            }

            docs.append(doc_entry)

        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole scan
            print(f"Warning: Failed to process {relative_path}: {e}")
            continue

    # Sort by path for stable, diff-friendly index output
    docs.sort(key=lambda x: x['path'])

    return docs
|
||||
|
||||
|
||||
def generate_index(repo_path: Path, repo_name: str) -> Dict:
    """
    Assemble the complete index structure for the repository.

    Args:
        repo_path: Path to repository root
        repo_name: Name of repository

    Returns:
        Complete index dict with version, repo name, a UTC "Z"-suffixed
        timestamp, the document count, and the document list
    """
    documents = scan_markdown_files(repo_path)

    # ISO-8601 UTC timestamp with a trailing "Z" instead of "+00:00"
    generated_at = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

    return {
        "version": "1.0",
        "repo": repo_name,
        "generated": generated_at,
        "doc_count": len(documents),
        "docs": documents,
    }
|
||||
|
||||
|
||||
def main():
    """Generate the documentation index and write .docs-index.json."""
    # The repo root is the parent of the tools/ directory holding this script
    repo_path = Path(__file__).parent.parent
    repo_name = repo_path.name

    print(f"Generating index for {repo_name}...")
    print(f"Repository path: {repo_path}")

    index = generate_index(repo_path, repo_name)

    # Write the index to the repository root
    index_path = repo_path / '.docs-index.json'
    with open(index_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII titles readable in the JSON
        json.dump(index, f, indent=2, ensure_ascii=False)

    print(f"Generated index with {index['doc_count']} documents")
    print(f"Written to: {index_path}")


if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in a new issue