
Refactor SEO description script and update workflow

Improved the modularity and readability of add_seo_descriptions.py by extracting SEO block handling into dedicated helpers, moving per-file processing out of main(), and splitting statistics saving into its own functions. Updated auto-add-seo.yml to detect changed markdown files by diffing the PR base and head SHAs, so only files touched by the PR's own commits are selected.
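
For reference, the doc-seo block the script manages sits at the top of each markdown file. As emitted by the new create_seo_block helper, it looks like this (the description value here is a placeholder):

```json
//[doc-seo]
{
    "Description": "A one-sentence summary of the page for search engines."
}
```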
pull/24000/head
SALİH ÖZKARA, 4 months ago
commit da2d2924ba
 .github/scripts/add_seo_descriptions.py | 285
 .github/workflows/auto-add-seo.yml      |  10
 2 files changed

.github/scripts/add_seo_descriptions.py

@@ -1,22 +1,22 @@
 import os
 import sys
 import re
+import json
 from openai import OpenAI

 client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

+# Regex patterns as constants
+SEO_BLOCK_PATTERN = r'```+json\s*//\[doc-seo\]\s*(\{.*?\})\s*```+'
+SEO_BLOCK_WITH_BACKTICKS_PATTERN = r'(```+)json\s*//\[doc-seo\]\s*(\{.*?\})\s*\1'
+
 def has_seo_description(content):
     """Check if content already has SEO description with Description field"""
-    import json
-    # Match SEO description block with 3 or more backticks
-    pattern = r'```+json\s*//\[doc-seo\]\s*(\{.*?\})\s*```+'
-    match = re.search(pattern, content, flags=re.DOTALL)
+    match = re.search(SEO_BLOCK_PATTERN, content, flags=re.DOTALL)
     if not match:
         return False
-    # Check if Description field exists and is not empty
     try:
         json_str = match.group(1)
         seo_data = json.loads(json_str)
@@ -24,24 +24,42 @@ def has_seo_description(content):
     except json.JSONDecodeError:
         return False

-def is_content_too_short(content):
-    """Check if content is less than 200 characters"""
-    # Remove SEO tags if present for accurate count
-    # Match SEO description block with 3 or more backticks
-    clean_content = re.sub(r'```+json\s*//\[doc-seo\].*?```+\s*', '', content, flags=re.DOTALL)
-    return len(clean_content.strip()) < 200
+def has_seo_block(content):
+    """Check if content has any SEO block (with or without Description)"""
+    return bool(re.search(SEO_BLOCK_PATTERN, content, flags=re.DOTALL))
+
+def remove_seo_blocks(content):
+    """Remove all SEO description blocks from content"""
+    return re.sub(SEO_BLOCK_PATTERN + r'\s*', '', content, flags=re.DOTALL)
+
+def is_content_too_short(content, min_length=200):
+    """Check if content is less than minimum length (excluding SEO blocks)"""
+    clean_content = remove_seo_blocks(content)
+    return len(clean_content.strip()) < min_length

 def get_content_preview(content, max_length=1000):
-    """Get preview of content for OpenAI"""
-    # Remove existing SEO tags if present
-    # Match SEO description block with 3 or more backticks
-    clean_content = re.sub(r'```+json\s*//\[doc-seo\].*?```+\s*', '', content, flags=re.DOTALL)
+    """Get preview of content for OpenAI (excluding SEO blocks)"""
+    clean_content = remove_seo_blocks(content)
     return clean_content[:max_length].strip()

+def escape_json_string(text):
+    """Escape special characters for JSON"""
+    return text.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')
+
+def create_seo_block(description):
+    """Create a new SEO block with the given description"""
+    escaped_desc = escape_json_string(description)
+    return f'''```json
+//[doc-seo]
+{{
+    "Description": "{escaped_desc}"
+}}
+```
+'''
+
 def generate_description(content, filename):
-    """Generate SEO description using OpenAI with system prompt from OpenAIService.cs"""
+    """Generate SEO description using OpenAI"""
     try:
         preview = get_content_preview(content)
@@ -69,154 +87,169 @@ Generate only the description text, nothing else:"""}
         )
         description = response.choices[0].message.content.strip()
         return description
     except Exception as e:
         print(f"❌ Error generating description: {e}")
         return f"Learn about {os.path.splitext(filename)[0]} in ABP Framework documentation."

-def add_seo_description(content, description):
-    """Add or update SEO description in content"""
-    import json
-    # Escape special characters for JSON
-    escaped_desc = description.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')
-    # Check if SEO block already exists
-    pattern = r'(```+)json\s*//\[doc-seo\]\s*(\{.*?\})\s*\1'
-    match = re.search(pattern, content, flags=re.DOTALL)
-    if match:
-        # SEO block exists, update Description field
-        backticks = match.group(1)
-        json_str = match.group(2)
-        try:
-            # Parse existing JSON
-            seo_data = json.loads(json_str)
-            # Update Description
-            seo_data['Description'] = description
-            # Convert back to formatted JSON
-            updated_json = json.dumps(seo_data, indent=4, ensure_ascii=False)
-            # Replace the old block with updated one
-            new_block = f'''{backticks}json
+def update_seo_description(content, description):
+    """Update existing SEO block with new description"""
+    match = re.search(SEO_BLOCK_WITH_BACKTICKS_PATTERN, content, flags=re.DOTALL)
+    if not match:
+        return None
+
+    backticks = match.group(1)
+    json_str = match.group(2)
+
+    try:
+        seo_data = json.loads(json_str)
+        seo_data['Description'] = description
+        updated_json = json.dumps(seo_data, indent=4, ensure_ascii=False)
+        new_block = f'''{backticks}json
 //[doc-seo]
 {updated_json}
 {backticks}'''
-            return re.sub(pattern, new_block, content, count=1, flags=re.DOTALL)
-        except json.JSONDecodeError:
-            # If JSON is invalid, replace the whole block
-            pass
-    # No existing block or invalid JSON, add new block at the beginning
-    seo_tag = f'''```json
-//[doc-seo]
-{{
-    "Description": "{escaped_desc}"
-}}
-```
-'''
-    return seo_tag + content
+        return re.sub(SEO_BLOCK_WITH_BACKTICKS_PATTERN, new_block, content, count=1, flags=re.DOTALL)
+    except json.JSONDecodeError:
+        return None
+
+def add_seo_description(content, description):
+    """Add or update SEO description in content"""
+    # Try to update existing block first
+    updated_content = update_seo_description(content, description)
+    if updated_content:
+        return updated_content
+    # No existing block or update failed, add new block at the beginning
+    return create_seo_block(description) + content

 def is_file_ignored(filepath, ignored_folders):
     """Check if file is in an ignored folder"""
     path_parts = filepath.split('/')
-    for ignored in ignored_folders:
-        if ignored in path_parts:
-            return True
-    return False
+    return any(ignored in path_parts for ignored in ignored_folders)
+
+def get_changed_files():
+    """Get changed files from command line or environment variable"""
+    if len(sys.argv) > 1:
+        return sys.argv[1:]
+    changed_files_str = os.environ.get('CHANGED_FILES', '')
+    return [f.strip() for f in changed_files_str.strip().split('\n') if f.strip()]
+
+def process_file(filepath, ignored_folders):
+    """Process a single markdown file. Returns (processed, skipped, skip_reason)"""
+    if not filepath.endswith('.md'):
+        return False, False, None
+
+    # Check if file is in ignored folder
+    if is_file_ignored(filepath, ignored_folders):
+        print(f"📄 Processing: {filepath}")
+        print(f"  🚫 Skipped (ignored folder)\n")
+        return False, True, 'ignored'
+
+    print(f"📄 Processing: {filepath}")
+
+    try:
+        # Read file with original line endings
+        with open(filepath, 'r', encoding='utf-8', newline='') as f:
+            content = f.read()
+
+        # Check if content is too short
+        if is_content_too_short(content):
+            print(f"  ⏭️ Skipped (content less than 200 characters)\n")
+            return False, True, 'too_short'
+
+        # Check if already has SEO description
+        if has_seo_description(content):
+            print(f"  ⏭️ Skipped (already has SEO description)\n")
+            return False, True, 'has_description'
+
+        # Generate description
+        filename = os.path.basename(filepath)
+        print(f"  🤖 Generating description...")
+        description = generate_description(content, filename)
+        print(f"  💡 Generated: {description}")
+
+        # Add or update SEO description
+        if has_seo_block(content):
+            print(f"  🔄 Updating existing SEO block...")
+        else:
+            print(f"  ➕ Adding new SEO block...")
+        updated_content = add_seo_description(content, description)
+
+        # Write back (preserving line endings)
+        with open(filepath, 'w', encoding='utf-8', newline='') as f:
+            f.write(updated_content)
+
+        print(f"  ✅ Updated successfully\n")
+        return True, False, None
+    except Exception as e:
+        print(f"  ❌ Error: {e}\n")
+        return False, False, None
+
+def save_statistics(processed_count, skipped_count, skipped_too_short, skipped_ignored):
+    """Save processing statistics to file"""
+    try:
+        with open('/tmp/seo_stats.txt', 'w') as f:
+            f.write(f"{processed_count}\n{skipped_count}\n{skipped_too_short}\n{skipped_ignored}")
+    except Exception as e:
+        print(f"⚠️ Warning: Could not save statistics: {e}")
+
+def save_updated_files(updated_files):
+    """Save list of updated files"""
+    try:
+        with open('/tmp/seo_updated_files.txt', 'w') as f:
+            f.write('\n'.join(updated_files))
+    except Exception as e:
+        print(f"⚠️ Warning: Could not save updated files list: {e}")

 def main():
-    # Ignored folders from GitHub variable (or default values)
+    # Get ignored folders from environment
     IGNORED_FOLDERS_STR = os.environ.get('IGNORED_FOLDERS', 'Blog-Posts,Community-Articles,_deleted,_resources')
     IGNORED_FOLDERS = [folder.strip() for folder in IGNORED_FOLDERS_STR.split(',') if folder.strip()]

-    # Get changed files from environment or command line
-    if len(sys.argv) > 1:
-        # Files passed as command line arguments
-        changed_files = sys.argv[1:]
-    else:
-        # Files from environment variable (for GitHub Actions)
-        changed_files_str = os.environ.get('CHANGED_FILES', '')
-        changed_files = [f.strip() for f in changed_files_str.strip().split('\n') if f.strip()]
+    # Get changed files
+    changed_files = get_changed_files()

-    # Statistics
     processed_count = 0
     skipped_count = 0
     skipped_too_short = 0
     skipped_ignored = 0
-    updated_files = []  # Track actually updated files
+    updated_files = []

     print("🤖 Processing changed markdown files...\n")
     print(f"🚫 Ignored folders: {', '.join(IGNORED_FOLDERS)}\n")

-    # Process each file
     for filepath in changed_files:
-        if not filepath.endswith('.md'):
-            continue
-        # Check if file is in ignored folder
-        if is_file_ignored(filepath, IGNORED_FOLDERS):
-            print(f"📄 Processing: {filepath}")
-            print(f"  🚫 Skipped (ignored folder)\n")
-            skipped_ignored += 1
-            skipped_count += 1
-            continue
-        print(f"📄 Processing: {filepath}")
-        try:
-            # Read file
-            with open(filepath, 'r', encoding='utf-8') as f:
-                content = f.read()
-            # Check if content is too short (less than 200 characters)
-            if is_content_too_short(content):
-                print(f"  ⏭️ Skipped (content less than 200 characters)\n")
-                skipped_too_short += 1
-                skipped_count += 1
-                continue
-            # Check if already has SEO description
-            if has_seo_description(content):
-                print(f"  ⏭️ Skipped (already has SEO description)\n")
-                skipped_count += 1
-                continue
-            # Generate description
-            filename = os.path.basename(filepath)
-            print(f"  🤖 Generating description...")
-            description = generate_description(content, filename)
-            print(f"  💡 Generated: {description}")
-            # Add SEO tag
-            updated_content = add_seo_description(content, description)
-            # Write back
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(updated_content)
-            print(f"  ✅ Updated successfully\n")
-            processed_count += 1
-            updated_files.append(filepath)  # Track this file as updated
-        except Exception as e:
-            print(f"  ❌ Error: {e}\n")
+        processed, skipped, skip_reason = process_file(filepath, IGNORED_FOLDERS)
+
+        if processed:
+            processed_count += 1
+            updated_files.append(filepath)
+        elif skipped:
+            skipped_count += 1
+            if skip_reason == 'too_short':
+                skipped_too_short += 1
+            elif skip_reason == 'ignored':
+                skipped_ignored += 1

-    # Print summary
     print(f"\n📊 Summary:")
     print(f"  ✅ Updated: {processed_count}")
     print(f"  ⏭️ Skipped (total): {skipped_count}")
     print(f"  ⏭️ Skipped (too short): {skipped_too_short}")
     print(f"  🚫 Skipped (ignored folder): {skipped_ignored}")

-    # Save counts and updated files list for next step
-    with open('/tmp/seo_stats.txt', 'w') as f:
-        f.write(f"{processed_count}\n{skipped_count}\n{skipped_too_short}\n{skipped_ignored}")
-    # Save updated files list
-    with open('/tmp/seo_updated_files.txt', 'w') as f:
-        f.write('\n'.join(updated_files))
+    # Save statistics
+    save_statistics(processed_count, skipped_count, skipped_too_short, skipped_ignored)
+    save_updated_files(updated_files)

 if __name__ == '__main__':
     main()
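
As a quick sanity check of the refactored helpers, the following hypothetical snippet (not part of this commit) exercises the add/detect/remove round trip. It assumes it is run from .github/scripts; the dummy key only satisfies the OpenAI client the module constructs at import time, and no API call is made:

import os
os.environ.setdefault('OPENAI_API_KEY', 'dummy')  # client is built at import time

from add_seo_descriptions import (
    create_seo_block,
    has_seo_block,
    has_seo_description,
    remove_seo_blocks,
)

body = "# Getting Started\n\nSome documentation body text.\n"
tagged = create_seo_block("Learn how to get started with ABP.") + body

assert has_seo_block(tagged)        # the block is detected
assert has_seo_description(tagged)  # the Description field is present and non-empty
assert not has_seo_block(body)      # untagged content has no block
assert remove_seo_blocks(tagged).strip() == body.strip()  # removal restores the body
print("SEO helpers behave as expected")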

.github/workflows/auto-add-seo.yml

@@ -52,8 +52,14 @@ jobs:
       - name: Get changed markdown files from merged PR
         id: changed-files
         run: |
-          # Get files changed in the merged PR (only Added and Modified, exclude Deleted)
-          FILES=$(git diff --name-only --diff-filter=AM ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.merge_commit_sha }} | grep 'docs/en/.*\.md$' || true)
+          # Get the list of commits in the PR
+          PR_HEAD_SHA="${{ github.event.pull_request.head.sha }}"
+          PR_BASE_SHA="${{ github.event.pull_request.base.sha }}"
+          echo "PR commits range: $PR_BASE_SHA..$PR_HEAD_SHA"
+
+          # Get files changed in the PR commits only (only Added and Modified, exclude Deleted)
+          FILES=$(git diff --name-only --diff-filter=AM $PR_BASE_SHA..$PR_HEAD_SHA | grep 'docs/en/.*\.md$' || true)
           echo "Files changed in the merged PR (added/modified only):"
           echo "$FILES"
