
Refactor SEO description script and update workflow

Improved the modularity and readability of add_seo_descriptions.py by extracting SEO block handling into dedicated helpers, moving per-file processing out of main(), and splitting statistics saving into its own functions. Updated auto-add-seo.yml to detect changed markdown files by diffing the PR base and head SHAs, so only files touched by the PR's own commits are selected.
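
For reference, the doc-seo block the script manages sits at the top of each markdown file. As emitted by the new create_seo_block helper, it looks like this (the description value here is a placeholder):

```json
//[doc-seo]
{
    "Description": "A one-sentence summary of the page for search engines."
}
```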
pull/24000/head
SALİH ÖZKARA, 4 months ago
commit da2d2924ba
 .github/scripts/add_seo_descriptions.py | 285
 .github/workflows/auto-add-seo.yml      |  10
 2 files changed

.github/scripts/add_seo_descriptions.py

@@ -1,22 +1,22 @@
 import os
 import sys
 import re
+import json
 from openai import OpenAI

 client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

+# Regex patterns as constants
+SEO_BLOCK_PATTERN = r'```+json\s*//\[doc-seo\]\s*(\{.*?\})\s*```+'
+SEO_BLOCK_WITH_BACKTICKS_PATTERN = r'(```+)json\s*//\[doc-seo\]\s*(\{.*?\})\s*\1'
+
 def has_seo_description(content):
     """Check if content already has SEO description with Description field"""
-    import json
-    # Match SEO description block with 3 or more backticks
-    pattern = r'```+json\s*//\[doc-seo\]\s*(\{.*?\})\s*```+'
-    match = re.search(pattern, content, flags=re.DOTALL)
+    match = re.search(SEO_BLOCK_PATTERN, content, flags=re.DOTALL)
     if not match:
         return False
-    # Check if Description field exists and is not empty
     try:
         json_str = match.group(1)
         seo_data = json.loads(json_str)
@@ -24,24 +24,42 @@ def has_seo_description(content):
     except json.JSONDecodeError:
         return False

-def is_content_too_short(content):
-    """Check if content is less than 200 characters"""
-    # Remove SEO tags if present for accurate count
-    # Match SEO description block with 3 or more backticks
-    clean_content = re.sub(r'```+json\s*//\[doc-seo\].*?```+\s*', '', content, flags=re.DOTALL)
-    return len(clean_content.strip()) < 200
+def has_seo_block(content):
+    """Check if content has any SEO block (with or without Description)"""
+    return bool(re.search(SEO_BLOCK_PATTERN, content, flags=re.DOTALL))
+
+def remove_seo_blocks(content):
+    """Remove all SEO description blocks from content"""
+    return re.sub(SEO_BLOCK_PATTERN + r'\s*', '', content, flags=re.DOTALL)
+
+def is_content_too_short(content, min_length=200):
+    """Check if content is less than minimum length (excluding SEO blocks)"""
+    clean_content = remove_seo_blocks(content)
+    return len(clean_content.strip()) < min_length

 def get_content_preview(content, max_length=1000):
-    """Get preview of content for OpenAI"""
-    # Remove existing SEO tags if present
-    # Match SEO description block with 3 or more backticks
-    clean_content = re.sub(r'```+json\s*//\[doc-seo\].*?```+\s*', '', content, flags=re.DOTALL)
+    """Get preview of content for OpenAI (excluding SEO blocks)"""
+    clean_content = remove_seo_blocks(content)
     return clean_content[:max_length].strip()

+def escape_json_string(text):
+    """Escape special characters for JSON"""
+    return text.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')
+
+def create_seo_block(description):
+    """Create a new SEO block with the given description"""
+    escaped_desc = escape_json_string(description)
+    return f'''```json
+//[doc-seo]
+{{
+    "Description": "{escaped_desc}"
+}}
+```
+'''
+
 def generate_description(content, filename):
-    """Generate SEO description using OpenAI with system prompt from OpenAIService.cs"""
+    """Generate SEO description using OpenAI"""
     try:
         preview = get_content_preview(content)
@@ -69,154 +87,169 @@ Generate only the description text, nothing else:"""}
         )
         description = response.choices[0].message.content.strip()
         return description
     except Exception as e:
         print(f"❌ Error generating description: {e}")
         return f"Learn about {os.path.splitext(filename)[0]} in ABP Framework documentation."

-def add_seo_description(content, description):
-    """Add or update SEO description in content"""
-    import json
-    # Escape special characters for JSON
-    escaped_desc = description.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')
-    # Check if SEO block already exists
-    pattern = r'(```+)json\s*//\[doc-seo\]\s*(\{.*?\})\s*\1'
-    match = re.search(pattern, content, flags=re.DOTALL)
-    if match:
-        # SEO block exists, update Description field
-        backticks = match.group(1)
-        json_str = match.group(2)
-        try:
-            # Parse existing JSON
-            seo_data = json.loads(json_str)
-            # Update Description
-            seo_data['Description'] = description
-            # Convert back to formatted JSON
-            updated_json = json.dumps(seo_data, indent=4, ensure_ascii=False)
-            # Replace the old block with updated one
-            new_block = f'''{backticks}json
+def update_seo_description(content, description):
+    """Update existing SEO block with new description"""
+    match = re.search(SEO_BLOCK_WITH_BACKTICKS_PATTERN, content, flags=re.DOTALL)
+    if not match:
+        return None
+
+    backticks = match.group(1)
+    json_str = match.group(2)
+
+    try:
+        seo_data = json.loads(json_str)
+        seo_data['Description'] = description
+        updated_json = json.dumps(seo_data, indent=4, ensure_ascii=False)
+        new_block = f'''{backticks}json
 //[doc-seo]
 {updated_json}
 {backticks}'''
-            return re.sub(pattern, new_block, content, count=1, flags=re.DOTALL)
-        except json.JSONDecodeError:
-            # If JSON is invalid, replace the whole block
-            pass
-    # No existing block or invalid JSON, add new block at the beginning
-    seo_tag = f'''```json
-//[doc-seo]
-{{
-    "Description": "{escaped_desc}"
-}}
-```
-'''
-    return seo_tag + content
+        return re.sub(SEO_BLOCK_WITH_BACKTICKS_PATTERN, new_block, content, count=1, flags=re.DOTALL)
+    except json.JSONDecodeError:
+        return None
+
+def add_seo_description(content, description):
+    """Add or update SEO description in content"""
+    # Try to update existing block first
+    updated_content = update_seo_description(content, description)
+    if updated_content:
+        return updated_content
+    # No existing block or update failed, add new block at the beginning
+    return create_seo_block(description) + content

 def is_file_ignored(filepath, ignored_folders):
     """Check if file is in an ignored folder"""
     path_parts = filepath.split('/')
-    for ignored in ignored_folders:
-        if ignored in path_parts:
-            return True
-    return False
+    return any(ignored in path_parts for ignored in ignored_folders)
+
+def get_changed_files():
+    """Get changed files from command line or environment variable"""
+    if len(sys.argv) > 1:
+        return sys.argv[1:]
+    changed_files_str = os.environ.get('CHANGED_FILES', '')
+    return [f.strip() for f in changed_files_str.strip().split('\n') if f.strip()]
+
+def process_file(filepath, ignored_folders):
+    """Process a single markdown file. Returns (processed, skipped, skip_reason)"""
+    if not filepath.endswith('.md'):
+        return False, False, None
+
+    # Check if file is in ignored folder
+    if is_file_ignored(filepath, ignored_folders):
+        print(f"📄 Processing: {filepath}")
+        print(f"  🚫 Skipped (ignored folder)\n")
+        return False, True, 'ignored'
+
+    print(f"📄 Processing: {filepath}")
+
+    try:
+        # Read file with original line endings
+        with open(filepath, 'r', encoding='utf-8', newline='') as f:
+            content = f.read()
+
+        # Check if content is too short
+        if is_content_too_short(content):
+            print(f"  ⏭️ Skipped (content less than 200 characters)\n")
+            return False, True, 'too_short'
+
+        # Check if already has SEO description
+        if has_seo_description(content):
+            print(f"  ⏭️ Skipped (already has SEO description)\n")
+            return False, True, 'has_description'
+
+        # Generate description
+        filename = os.path.basename(filepath)
+        print(f"  🤖 Generating description...")
+        description = generate_description(content, filename)
+        print(f"  💡 Generated: {description}")
+
+        # Add or update SEO description
+        if has_seo_block(content):
+            print(f"  🔄 Updating existing SEO block...")
+        else:
+            print(f"  ➕ Adding new SEO block...")
+        updated_content = add_seo_description(content, description)
+
+        # Write back (preserving line endings)
+        with open(filepath, 'w', encoding='utf-8', newline='') as f:
+            f.write(updated_content)
+
+        print(f"  ✅ Updated successfully\n")
+        return True, False, None
+    except Exception as e:
+        print(f"  ❌ Error: {e}\n")
+        return False, False, None
+
+def save_statistics(processed_count, skipped_count, skipped_too_short, skipped_ignored):
+    """Save processing statistics to file"""
+    try:
+        with open('/tmp/seo_stats.txt', 'w') as f:
+            f.write(f"{processed_count}\n{skipped_count}\n{skipped_too_short}\n{skipped_ignored}")
+    except Exception as e:
+        print(f"⚠️ Warning: Could not save statistics: {e}")
+
+def save_updated_files(updated_files):
+    """Save list of updated files"""
+    try:
+        with open('/tmp/seo_updated_files.txt', 'w') as f:
+            f.write('\n'.join(updated_files))
+    except Exception as e:
+        print(f"⚠️ Warning: Could not save updated files list: {e}")

 def main():
-    # Ignored folders from GitHub variable (or default values)
+    # Get ignored folders from environment
     IGNORED_FOLDERS_STR = os.environ.get('IGNORED_FOLDERS', 'Blog-Posts,Community-Articles,_deleted,_resources')
     IGNORED_FOLDERS = [folder.strip() for folder in IGNORED_FOLDERS_STR.split(',') if folder.strip()]

-    # Get changed files from environment or command line
-    if len(sys.argv) > 1:
-        # Files passed as command line arguments
-        changed_files = sys.argv[1:]
-    else:
-        # Files from environment variable (for GitHub Actions)
-        changed_files_str = os.environ.get('CHANGED_FILES', '')
-        changed_files = [f.strip() for f in changed_files_str.strip().split('\n') if f.strip()]
+    # Get changed files
+    changed_files = get_changed_files()

-    # Statistics
     processed_count = 0
     skipped_count = 0
     skipped_too_short = 0
     skipped_ignored = 0
-    updated_files = []  # Track actually updated files
+    updated_files = []

     print("🤖 Processing changed markdown files...\n")
     print(f"🚫 Ignored folders: {', '.join(IGNORED_FOLDERS)}\n")

-    # Process each file
     for filepath in changed_files:
-        if not filepath.endswith('.md'):
-            continue
-        # Check if file is in ignored folder
-        if is_file_ignored(filepath, IGNORED_FOLDERS):
-            print(f"📄 Processing: {filepath}")
-            print(f"  🚫 Skipped (ignored folder)\n")
-            skipped_ignored += 1
-            skipped_count += 1
-            continue
-        print(f"📄 Processing: {filepath}")
-        try:
-            # Read file
-            with open(filepath, 'r', encoding='utf-8') as f:
-                content = f.read()
-            # Check if content is too short (less than 200 characters)
-            if is_content_too_short(content):
-                print(f"  ⏭️ Skipped (content less than 200 characters)\n")
-                skipped_too_short += 1
-                skipped_count += 1
-                continue
-            # Check if already has SEO description
-            if has_seo_description(content):
-                print(f"  ⏭️ Skipped (already has SEO description)\n")
-                skipped_count += 1
-                continue
-            # Generate description
-            filename = os.path.basename(filepath)
-            print(f"  🤖 Generating description...")
-            description = generate_description(content, filename)
-            print(f"  💡 Generated: {description}")
-            # Add SEO tag
-            updated_content = add_seo_description(content, description)
-            # Write back
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(updated_content)
-            print(f"  ✅ Updated successfully\n")
-            processed_count += 1
-            updated_files.append(filepath)  # Track this file as updated
-        except Exception as e:
-            print(f"  ❌ Error: {e}\n")
+        processed, skipped, skip_reason = process_file(filepath, IGNORED_FOLDERS)
+
+        if processed:
+            processed_count += 1
+            updated_files.append(filepath)
+        elif skipped:
+            skipped_count += 1
+            if skip_reason == 'too_short':
+                skipped_too_short += 1
+            elif skip_reason == 'ignored':
+                skipped_ignored += 1

-    # Print summary
     print(f"\n📊 Summary:")
     print(f"  ✅ Updated: {processed_count}")
     print(f"  ⏭️ Skipped (total): {skipped_count}")
     print(f"  ⏭️ Skipped (too short): {skipped_too_short}")
     print(f"  🚫 Skipped (ignored folder): {skipped_ignored}")

-    # Save counts and updated files list for next step
-    with open('/tmp/seo_stats.txt', 'w') as f:
-        f.write(f"{processed_count}\n{skipped_count}\n{skipped_too_short}\n{skipped_ignored}")
-    # Save updated files list
-    with open('/tmp/seo_updated_files.txt', 'w') as f:
-        f.write('\n'.join(updated_files))
+    # Save statistics
+    save_statistics(processed_count, skipped_count, skipped_too_short, skipped_ignored)
+    save_updated_files(updated_files)

 if __name__ == '__main__':
     main()
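
As a quick sanity check of the refactored helpers, the following hypothetical snippet (not part of this commit) exercises the add/detect/remove round trip. It assumes it is run from .github/scripts; the dummy key only satisfies the OpenAI client the module constructs at import time, and no API call is made:

import os
os.environ.setdefault('OPENAI_API_KEY', 'dummy')  # client is built at import time

from add_seo_descriptions import (
    create_seo_block,
    has_seo_block,
    has_seo_description,
    remove_seo_blocks,
)

body = "# Getting Started\n\nSome documentation body text.\n"
tagged = create_seo_block("Learn how to get started with ABP.") + body

assert has_seo_block(tagged)        # the block is detected
assert has_seo_description(tagged)  # the Description field is present and non-empty
assert not has_seo_block(body)      # untagged content has no block
assert remove_seo_blocks(tagged).strip() == body.strip()  # removal restores the body
print("SEO helpers behave as expected")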

.github/workflows/auto-add-seo.yml

@@ -52,8 +52,14 @@ jobs:
       - name: Get changed markdown files from merged PR
         id: changed-files
         run: |
-          # Get files changed in the merged PR (only Added and Modified, exclude Deleted)
-          FILES=$(git diff --name-only --diff-filter=AM ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.merge_commit_sha }} | grep 'docs/en/.*\.md$' || true)
+          # Get the list of commits in the PR
+          PR_HEAD_SHA="${{ github.event.pull_request.head.sha }}"
+          PR_BASE_SHA="${{ github.event.pull_request.base.sha }}"
+          echo "PR commits range: $PR_BASE_SHA..$PR_HEAD_SHA"
+
+          # Get files changed in the PR commits only (only Added and Modified, exclude Deleted)
+          FILES=$(git diff --name-only --diff-filter=AM $PR_BASE_SHA..$PR_HEAD_SHA | grep 'docs/en/.*\.md$' || true)
           echo "Files changed in the merged PR (added/modified only):"
           echo "$FILES"
