Merge pull request #24000 from abpframework/salihozkara/seo

Refactor SEO description script and update workflow
7 months ago · dbfd69f882
3 changed files with 168 additions and 129 deletions
--- a/.github/scripts/add_seo_descriptions.py
+++ b/.github/scripts/add_seo_descriptions.py
@@ -1,22 +1,22 @@
 import os
 import sys
 import re
+import json
 from openai import OpenAI

 client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

+# Regex patterns as constants
+SEO_BLOCK_PATTERN = r'```+json\s*//\[doc-seo\]\s*(\{.*?\})\s*```+'
+SEO_BLOCK_WITH_BACKTICKS_PATTERN = r'(```+)json\s*//\[doc-seo\]\s*(\{.*?\})\s*\1'
+
 def has_seo_description(content):
    """Check if content already has SEO description with Description field"""
-    import json
-    
-    # Match SEO description block with 3 or more backticks
-    pattern = r'```+json\s*//\[doc-seo\]\s*(\{.*?\})\s*```+'
-    match = re.search(pattern, content, flags=re.DOTALL)
+    match = re.search(SEO_BLOCK_PATTERN, content, flags=re.DOTALL)
    
    if not match:
        return False
    
-    # Check if Description field exists and is not empty
    try:
        json_str = match.group(1)
        seo_data = json.loads(json_str)
@@ -24,24 +24,42 @@ def has_seo_description(content):
    except json.JSONDecodeError:
        return False

-def is_content_too_short(content):
-    """Check if content is less than 200 characters"""
-    # Remove SEO tags if present for accurate count
-    # Match SEO description block with 3 or more backticks
-    clean_content = re.sub(r'```+json\s*//\[doc-seo\].*?```+\s*', '', content, flags=re.DOTALL)
-    
-    return len(clean_content.strip()) < 200
+def has_seo_block(content):
+    """Check if content has any SEO block (with or without Description)"""
+    return bool(re.search(SEO_BLOCK_PATTERN, content, flags=re.DOTALL))
+
+def remove_seo_blocks(content):
+    """Remove all SEO description blocks from content"""
+    return re.sub(SEO_BLOCK_PATTERN + r'\s*', '', content, flags=re.DOTALL)
+
+def is_content_too_short(content, min_length=200):
+    """Check if content is less than minimum length (excluding SEO blocks)"""
+    clean_content = remove_seo_blocks(content)
+    return len(clean_content.strip()) < min_length

 def get_content_preview(content, max_length=1000):
-    """Get preview of content for OpenAI"""
-    # Remove existing SEO tags if present
-    # Match SEO description block with 3 or more backticks
-    clean_content = re.sub(r'```+json\s*//\[doc-seo\].*?```+\s*', '', content, flags=re.DOTALL)
-    
+    """Get preview of content for OpenAI (excluding SEO blocks)"""
+    clean_content = remove_seo_blocks(content)
    return clean_content[:max_length].strip()

+def escape_json_string(text):
+    """Escape special characters for JSON"""
+    return text.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')
+
+def create_seo_block(description):
+    """Create a new SEO block with the given description"""
+    escaped_desc = escape_json_string(description)
+    return f'''```json
+//[doc-seo]
+{{
+    "Description": "{escaped_desc}"
+}}
+```
+
+'''
+
 def generate_description(content, filename):
-    """Generate SEO description using OpenAI with system prompt from OpenAIService.cs"""
+    """Generate SEO description using OpenAI"""
    try:
        preview = get_content_preview(content)
        
@@ -69,154 +87,169 @@ Generate only the description text, nothing else:"""}
        )
        
        description = response.choices[0].message.content.strip()
-        
        return description
    except Exception as e:
        print(f"❌ Error generating description: {e}")
        return f"Learn about {os.path.splitext(filename)[0]} in ABP Framework documentation."

-def add_seo_description(content, description):
-    """Add or update SEO description in content"""
-    import json
+def update_seo_description(content, description):
+    """Update existing SEO block with new description"""
+    match = re.search(SEO_BLOCK_WITH_BACKTICKS_PATTERN, content, flags=re.DOTALL)
    
-    # Escape special characters for JSON
-    escaped_desc = description.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')
+    if not match:
+        return None
    
-    # Check if SEO block already exists
-    pattern = r'(```+)json\s*//\[doc-seo\]\s*(\{.*?\})\s*\1'
-    match = re.search(pattern, content, flags=re.DOTALL)
+    backticks = match.group(1)
+    json_str = match.group(2)
    
-    if match:
-        # SEO block exists, update Description field
-        backticks = match.group(1)
-        json_str = match.group(2)
+    try:
+        seo_data = json.loads(json_str)
+        seo_data['Description'] = description
+        updated_json = json.dumps(seo_data, indent=4, ensure_ascii=False)
        
-        try:
-            # Parse existing JSON
-            seo_data = json.loads(json_str)
-            # Update Description
-            seo_data['Description'] = description
-            # Convert back to formatted JSON
-            updated_json = json.dumps(seo_data, indent=4, ensure_ascii=False)
-            
-            # Replace the old block with updated one
-            new_block = f'''{backticks}json
+        new_block = f'''{backticks}json
 //[doc-seo]
 {updated_json}
 {backticks}'''
-            
-            return re.sub(pattern, new_block, content, count=1, flags=re.DOTALL)
-        except json.JSONDecodeError:
-            # If JSON is invalid, replace the whole block
-            pass
-    
-    # No existing block or invalid JSON, add new block at the beginning
-    seo_tag = f'''```json
-//[doc-seo]
-{{
-    "Description": "{escaped_desc}"
-}}
-```
+        
+        return re.sub(SEO_BLOCK_WITH_BACKTICKS_PATTERN, new_block, content, count=1, flags=re.DOTALL)
+    except json.JSONDecodeError:
+        return None

-'''
-    return seo_tag + content
+def add_seo_description(content, description):
+    """Add or update SEO description in content"""
+    # Try to update existing block first
+    updated_content = update_seo_description(content, description)
+    if updated_content:
+        return updated_content
+    
+    # No existing block or update failed, add new block at the beginning
+    return create_seo_block(description) + content

 def is_file_ignored(filepath, ignored_folders):
    """Check if file is in an ignored folder"""
    path_parts = filepath.split('/')
-    for ignored in ignored_folders:
-        if ignored in path_parts:
-            return True
-    return False
+    return any(ignored in path_parts for ignored in ignored_folders)
+
+def get_changed_files():
+    """Get changed files from command line or environment variable"""
+    if len(sys.argv) > 1:
+        return sys.argv[1:]
+    
+    changed_files_str = os.environ.get('CHANGED_FILES', '')
+    return [f.strip() for f in changed_files_str.strip().split('\n') if f.strip()]
+
+def process_file(filepath, ignored_folders):
+    """Process a single markdown file. Returns (processed, skipped, skip_reason)"""
+    if not filepath.endswith('.md'):
+        return False, False, None
+    
+    # Check if file is in ignored folder
+    if is_file_ignored(filepath, ignored_folders):
+        print(f"📄 Processing: {filepath}")
+        print(f"   🚫 Skipped (ignored folder)\n")
+        return False, True, 'ignored'
+    
+    print(f"📄 Processing: {filepath}")
+    
+    try:
+        # Read file with original line endings
+        with open(filepath, 'r', encoding='utf-8', newline='') as f:
+            content = f.read()
+        
+        # Check if content is too short
+        if is_content_too_short(content):
+            print(f"   ⏭️  Skipped (content less than 200 characters)\n")
+            return False, True, 'too_short'
+        
+        # Check if already has SEO description
+        if has_seo_description(content):
+            print(f"   ⏭️  Skipped (already has SEO description)\n")
+            return False, True, 'has_description'
+        
+        # Generate description
+        filename = os.path.basename(filepath)
+        print(f"   🤖 Generating description...")
+        description = generate_description(content, filename)
+        print(f"   💡 Generated: {description}")
+        
+        # Add or update SEO description
+        if has_seo_block(content):
+            print(f"   🔄 Updating existing SEO block...")
+        else:
+            print(f"   ➕ Adding new SEO block...")
+        
+        updated_content = add_seo_description(content, description)
+        
+        # Write back (preserving line endings)
+        with open(filepath, 'w', encoding='utf-8', newline='') as f:
+            f.write(updated_content)
+        
+        print(f"   ✅ Updated successfully\n")
+        return True, False, None
+        
+    except Exception as e:
+        print(f"   ❌ Error: {e}\n")
+        return False, False, None
+
+def save_statistics(processed_count, skipped_count, skipped_too_short, skipped_ignored):
+    """Save processing statistics to file"""
+    try:
+        with open('/tmp/seo_stats.txt', 'w') as f:
+            f.write(f"{processed_count}\n{skipped_count}\n{skipped_too_short}\n{skipped_ignored}")
+    except Exception as e:
+        print(f"⚠️  Warning: Could not save statistics: {e}")
+
+def save_updated_files(updated_files):
+    """Save list of updated files"""
+    try:
+        with open('/tmp/seo_updated_files.txt', 'w') as f:
+            f.write('\n'.join(updated_files))
+    except Exception as e:
+        print(f"⚠️  Warning: Could not save updated files list: {e}")

 def main():
-    # Ignored folders from GitHub variable (or default values)
+    # Get ignored folders from environment
    IGNORED_FOLDERS_STR = os.environ.get('IGNORED_FOLDERS', 'Blog-Posts,Community-Articles,_deleted,_resources')
    IGNORED_FOLDERS = [folder.strip() for folder in IGNORED_FOLDERS_STR.split(',') if folder.strip()]
    
-    # Get changed files from environment or command line
-    if len(sys.argv) > 1:
-        # Files passed as command line arguments
-        changed_files = sys.argv[1:]
-    else:
-        # Files from environment variable (for GitHub Actions)
-        changed_files_str = os.environ.get('CHANGED_FILES', '')
-        changed_files = [f.strip() for f in changed_files_str.strip().split('\n') if f.strip()]
+    # Get changed files
+    changed_files = get_changed_files()
    
+    # Statistics
    processed_count = 0
    skipped_count = 0
    skipped_too_short = 0
    skipped_ignored = 0
-    updated_files = []  # Track actually updated files
+    updated_files = []
    
    print("🤖 Processing changed markdown files...\n")
-    print(f"🚫 Ignored folders: {', '.join(IGNORED_FOLDERS)}\n")
+    print(f"🚫 Ignored folders: {', '.join(IGNORED_FOLDERS)}\n")
    
+    # Process each file
    for filepath in changed_files:
-        if not filepath.endswith('.md'):
-            continue
+        processed, skipped, skip_reason = process_file(filepath, IGNORED_FOLDERS)
        
-        # Check if file is in ignored folder
-        if is_file_ignored(filepath, IGNORED_FOLDERS):
-            print(f"📄 Processing: {filepath}")
-            print(f"   🚫 Skipped (ignored folder)\n")
-            skipped_ignored += 1
+        if processed:
+            processed_count += 1
+            updated_files.append(filepath)
+        elif skipped:
            skipped_count += 1
-            continue
-            
-        print(f"📄 Processing: {filepath}")
-        
-        try:
-            # Read file
-            with open(filepath, 'r', encoding='utf-8') as f:
-                content = f.read()
-            
-            # Check if content is too short (less than 200 characters)
-            if is_content_too_short(content):
-                print(f"   ⏭️  Skipped (content less than 200 characters)\n")
+            if skip_reason == 'too_short':
                skipped_too_short += 1
-                skipped_count += 1
-                continue
-            
-            # Check if already has SEO description
-            if has_seo_description(content):
-                print(f"   ⏭️  Skipped (already has SEO description)\n")
-                skipped_count += 1
-                continue
-            
-            # Generate description
-            filename = os.path.basename(filepath)
-            print(f"   🤖 Generating description...")
-            description = generate_description(content, filename)
-            print(f"   💡 Generated: {description}")
-            
-            # Add SEO tag
-            updated_content = add_seo_description(content, description)
-            
-            # Write back
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(updated_content)
-            
-            print(f"   ✅ Updated successfully\n")
-            processed_count += 1
-            updated_files.append(filepath)  # Track this file as updated
-            
-        except Exception as e:
-            print(f"   ❌ Error: {e}\n")
+            elif skip_reason == 'ignored':
+                skipped_ignored += 1
    
+    # Print summary
    print(f"\n📊 Summary:")
    print(f"   ✅ Updated: {processed_count}")
    print(f"   ⏭️  Skipped (total): {skipped_count}")
    print(f"   ⏭️  Skipped (too short): {skipped_too_short}")
    print(f"   🚫 Skipped (ignored folder): {skipped_ignored}")
    
-    # Save counts and updated files list for next step
-    with open('/tmp/seo_stats.txt', 'w') as f:
-        f.write(f"{processed_count}\n{skipped_count}\n{skipped_too_short}\n{skipped_ignored}")
-    
-    # Save updated files list
-    with open('/tmp/seo_updated_files.txt', 'w') as f:
-        f.write('\n'.join(updated_files))
+    # Save statistics
+    save_statistics(processed_count, skipped_count, skipped_too_short, skipped_ignored)
+    save_updated_files(updated_files)

 if __name__ == '__main__':
    main()
--- a/.github/workflows/auto-add-seo.yml
+++ b/.github/workflows/auto-add-seo.yml
@@ -52,8 +52,14 @@ jobs:
      - name: Get changed markdown files from merged PR
        id: changed-files
        run: |
-          # Get files changed in the merged PR (only Added and Modified, exclude Deleted)
-          FILES=$(git diff --name-only --diff-filter=AM ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.merge_commit_sha }} | grep 'docs/en/.*\.md$' || true)
+          # Get the list of commits in the PR
+          PR_HEAD_SHA="${{ github.event.pull_request.head.sha }}"
+          PR_BASE_SHA="${{ github.event.pull_request.base.sha }}"
+          
+          echo "PR commits range: $PR_BASE_SHA..$PR_HEAD_SHA"
+          
+          # Get files changed in the PR commits only (only Added and Modified, exclude Deleted)
+          FILES=$(git diff --name-only --diff-filter=AM $PR_BASE_SHA..$PR_HEAD_SHA | grep 'docs/en/.*\.md$' || true)
          
          echo "Files changed in the merged PR (added/modified only):"
          echo "$FILES"
--- a/docs/en/cli/index.md
+++ b/docs/en/cli/index.md
@@ -10,7 +10,7 @@
 ABP CLI (Command Line Interface) is a command line tool to perform some common operations for ABP based solutions or ABP Studio features.

 > With **v8.2+**, the old/legacy ABP CLI has been replaced with a new CLI system to align with the new templating system and [ABP Studio](../studio/index.md). The new ABP CLI commands are explained in this documentation. However, if you want to learn more about the differences between the old and new CLIs, want to learn the reason for the change, or need guidance to use the old ABP CLI, please refer to the [Old vs New CLI](differences-between-old-and-new-cli.md) documentation.
-
+>
 > You may need to remove the Old CLI before installing the New CLI, by running the following command: `dotnet tool uninstall -g Volo.Abp.Cli`

 ## Installation