diff --git a/.github/scripts/add_seo_descriptions.py b/.github/scripts/add_seo_descriptions.py index 4475c809bf..25fd508479 100644 --- a/.github/scripts/add_seo_descriptions.py +++ b/.github/scripts/add_seo_descriptions.py @@ -1,22 +1,22 @@ import os import sys import re +import json from openai import OpenAI client = OpenAI(api_key=os.environ['OPENAI_API_KEY']) +# Regex patterns as constants +SEO_BLOCK_PATTERN = r'```+json\s*//\[doc-seo\]\s*(\{.*?\})\s*```+' +SEO_BLOCK_WITH_BACKTICKS_PATTERN = r'(```+)json\s*//\[doc-seo\]\s*(\{.*?\})\s*\1' + def has_seo_description(content): """Check if content already has SEO description with Description field""" - import json - - # Match SEO description block with 3 or more backticks - pattern = r'```+json\s*//\[doc-seo\]\s*(\{.*?\})\s*```+' - match = re.search(pattern, content, flags=re.DOTALL) + match = re.search(SEO_BLOCK_PATTERN, content, flags=re.DOTALL) if not match: return False - # Check if Description field exists and is not empty try: json_str = match.group(1) seo_data = json.loads(json_str) @@ -24,24 +24,42 @@ def has_seo_description(content): except json.JSONDecodeError: return False -def is_content_too_short(content): - """Check if content is less than 200 characters""" - # Remove SEO tags if present for accurate count - # Match SEO description block with 3 or more backticks - clean_content = re.sub(r'```+json\s*//\[doc-seo\].*?```+\s*', '', content, flags=re.DOTALL) - - return len(clean_content.strip()) < 200 +def has_seo_block(content): + """Check if content has any SEO block (with or without Description)""" + return bool(re.search(SEO_BLOCK_PATTERN, content, flags=re.DOTALL)) + +def remove_seo_blocks(content): + """Remove all SEO description blocks from content""" + return re.sub(SEO_BLOCK_PATTERN + r'\s*', '', content, flags=re.DOTALL) + +def is_content_too_short(content, min_length=200): + """Check if content is less than minimum length (excluding SEO blocks)""" + clean_content = remove_seo_blocks(content) + return len(clean_content.strip()) < min_length def get_content_preview(content, max_length=1000): - """Get preview of content for OpenAI""" - # Remove existing SEO tags if present - # Match SEO description block with 3 or more backticks - clean_content = re.sub(r'```+json\s*//\[doc-seo\].*?```+\s*', '', content, flags=re.DOTALL) - + """Get preview of content for OpenAI (excluding SEO blocks)""" + clean_content = remove_seo_blocks(content) return clean_content[:max_length].strip() +def escape_json_string(text): + """Escape special characters for JSON""" + return text.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n') + +def create_seo_block(description): + """Create a new SEO block with the given description""" + escaped_desc = escape_json_string(description) + return f'''```json +//[doc-seo] +{{ + "Description": "{escaped_desc}" +}} +``` + +''' + def generate_description(content, filename): - """Generate SEO description using OpenAI with system prompt from OpenAIService.cs""" + """Generate SEO description using OpenAI""" try: preview = get_content_preview(content) @@ -69,154 +87,169 @@ Generate only the description text, nothing else:"""} ) description = response.choices[0].message.content.strip() - return description except Exception as e: print(f"āŒ Error generating description: {e}") return f"Learn about {os.path.splitext(filename)[0]} in ABP Framework documentation." 
-def add_seo_description(content, description): - """Add or update SEO description in content""" - import json +def update_seo_description(content, description): + """Update existing SEO block with new description""" + match = re.search(SEO_BLOCK_WITH_BACKTICKS_PATTERN, content, flags=re.DOTALL) - # Escape special characters for JSON - escaped_desc = description.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n') + if not match: + return None - # Check if SEO block already exists - pattern = r'(```+)json\s*//\[doc-seo\]\s*(\{.*?\})\s*\1' - match = re.search(pattern, content, flags=re.DOTALL) + backticks = match.group(1) + json_str = match.group(2) - if match: - # SEO block exists, update Description field - backticks = match.group(1) - json_str = match.group(2) + try: + seo_data = json.loads(json_str) + seo_data['Description'] = description + updated_json = json.dumps(seo_data, indent=4, ensure_ascii=False) - try: - # Parse existing JSON - seo_data = json.loads(json_str) - # Update Description - seo_data['Description'] = description - # Convert back to formatted JSON - updated_json = json.dumps(seo_data, indent=4, ensure_ascii=False) - - # Replace the old block with updated one - new_block = f'''{backticks}json + new_block = f'''{backticks}json //[doc-seo] {updated_json} {backticks}''' - - return re.sub(pattern, new_block, content, count=1, flags=re.DOTALL) - except json.JSONDecodeError: - # If JSON is invalid, replace the whole block - pass - - # No existing block or invalid JSON, add new block at the beginning - seo_tag = f'''```json -//[doc-seo] -{{ - "Description": "{escaped_desc}" -}} -``` + + return re.sub(SEO_BLOCK_WITH_BACKTICKS_PATTERN, lambda _: new_block, content, count=1, flags=re.DOTALL)  # callable repl: keep backslash escapes from json.dumps literal + except json.JSONDecodeError: + return None -''' - return seo_tag + content +def add_seo_description(content, description): + """Add or update SEO description in content""" + # Try to update existing block first + updated_content = update_seo_description(content, description) + if updated_content: + return updated_content + + # No existing block or update failed, add new block at the beginning + return create_seo_block(description) + content def is_file_ignored(filepath, ignored_folders): """Check if file is in an ignored folder""" path_parts = filepath.split('/') - for ignored in ignored_folders: - if ignored in path_parts: - return True - return False + return any(ignored in path_parts for ignored in ignored_folders) + +def get_changed_files(): + """Get changed files from command line or environment variable""" + if len(sys.argv) > 1: + return sys.argv[1:] + + changed_files_str = os.environ.get('CHANGED_FILES', '') + return [f.strip() for f in changed_files_str.strip().split('\n') if f.strip()] + +def process_file(filepath, ignored_folders): + """Process a single markdown file. 
Returns (processed, skipped, skip_reason)""" + if not filepath.endswith('.md'): + return False, False, None + + # Check if file is in ignored folder + if is_file_ignored(filepath, ignored_folders): + print(f"šŸ“„ Processing: {filepath}") + print(f" 🚫 Skipped (ignored folder)\n") + return False, True, 'ignored' + + print(f"šŸ“„ Processing: {filepath}") + + try: + # Read file with original line endings + with open(filepath, 'r', encoding='utf-8', newline='') as f: + content = f.read() + + # Check if content is too short + if is_content_too_short(content): + print(f" ā­ļø Skipped (content less than 200 characters)\n") + return False, True, 'too_short' + + # Check if already has SEO description + if has_seo_description(content): + print(f" ā­ļø Skipped (already has SEO description)\n") + return False, True, 'has_description' + + # Generate description + filename = os.path.basename(filepath) + print(f" šŸ¤– Generating description...") + description = generate_description(content, filename) + print(f" šŸ’” Generated: {description}") + + # Add or update SEO description + if has_seo_block(content): + print(f" šŸ”„ Updating existing SEO block...") + else: + print(f" āž• Adding new SEO block...") + + updated_content = add_seo_description(content, description) + + # Write back (preserving line endings) + with open(filepath, 'w', encoding='utf-8', newline='') as f: + f.write(updated_content) + + print(f" āœ… Updated successfully\n") + return True, False, None + + except Exception as e: + print(f" āŒ Error: {e}\n") + return False, False, None + +def save_statistics(processed_count, skipped_count, skipped_too_short, skipped_ignored): + """Save processing statistics to file""" + try: + with open('/tmp/seo_stats.txt', 'w') as f: + f.write(f"{processed_count}\n{skipped_count}\n{skipped_too_short}\n{skipped_ignored}") + except Exception as e: + print(f"āš ļø Warning: Could not save statistics: {e}") + +def save_updated_files(updated_files): + """Save list of updated files""" + try: + with open('/tmp/seo_updated_files.txt', 'w') as f: + f.write('\n'.join(updated_files)) + except Exception as e: + print(f"āš ļø Warning: Could not save updated files list: {e}") def main(): - # Ignored folders from GitHub variable (or default values) + # Get ignored folders from environment IGNORED_FOLDERS_STR = os.environ.get('IGNORED_FOLDERS', 'Blog-Posts,Community-Articles,_deleted,_resources') IGNORED_FOLDERS = [folder.strip() for folder in IGNORED_FOLDERS_STR.split(',') if folder.strip()] - # Get changed files from environment or command line - if len(sys.argv) > 1: - # Files passed as command line arguments - changed_files = sys.argv[1:] - else: - # Files from environment variable (for GitHub Actions) - changed_files_str = os.environ.get('CHANGED_FILES', '') - changed_files = [f.strip() for f in changed_files_str.strip().split('\n') if f.strip()] + # Get changed files + changed_files = get_changed_files() + # Statistics processed_count = 0 skipped_count = 0 skipped_too_short = 0 skipped_ignored = 0 - updated_files = [] # Track actually updated files + updated_files = [] print("šŸ¤– Processing changed markdown files...\n") print(f"🚫 Ignored folders: {', '.join(IGNORED_FOLDERS)}\n") + # Process each file for filepath in changed_files: - if not filepath.endswith('.md'): - continue + processed, skipped, skip_reason = process_file(filepath, IGNORED_FOLDERS) - # Check if file is in ignored folder - if is_file_ignored(filepath, IGNORED_FOLDERS): - 
print(f"šŸ“„ Processing: {filepath}") - print(f" 🚫 Skipped (ignored folder)\n") - skipped_ignored += 1 + if processed: + processed_count += 1 + updated_files.append(filepath) + elif skipped: skipped_count += 1 - continue - - print(f"šŸ“„ Processing: {filepath}") - - try: - # Read file - with open(filepath, 'r', encoding='utf-8') as f: - content = f.read() - - # Check if content is too short (less than 200 characters) - if is_content_too_short(content): - print(f" ā­ļø Skipped (content less than 200 characters)\n") + if skip_reason == 'too_short': skipped_too_short += 1 - skipped_count += 1 - continue - - # Check if already has SEO description - if has_seo_description(content): - print(f" ā­ļø Skipped (already has SEO description)\n") - skipped_count += 1 - continue - - # Generate description - filename = os.path.basename(filepath) - print(f" šŸ¤– Generating description...") - description = generate_description(content, filename) - print(f" šŸ’” Generated: {description}") - - # Add SEO tag - updated_content = add_seo_description(content, description) - - # Write back - with open(filepath, 'w', encoding='utf-8') as f: - f.write(updated_content) - - print(f" āœ… Updated successfully\n") - processed_count += 1 - updated_files.append(filepath) # Track this file as updated - - except Exception as e: - print(f" āŒ Error: {e}\n") + elif skip_reason == 'ignored': + skipped_ignored += 1 + # Print summary print(f"\nšŸ“Š Summary:") print(f" āœ… Updated: {processed_count}") print(f" ā­ļø Skipped (total): {skipped_count}") print(f" ā­ļø Skipped (too short): {skipped_too_short}") print(f" 🚫 Skipped (ignored folder): {skipped_ignored}") - # Save counts and updated files list for next step - with open('/tmp/seo_stats.txt', 'w') as f: - f.write(f"{processed_count}\n{skipped_count}\n{skipped_too_short}\n{skipped_ignored}") - - # Save updated files list - with open('/tmp/seo_updated_files.txt', 'w') as f: - f.write('\n'.join(updated_files)) + # Save statistics + save_statistics(processed_count, skipped_count, skipped_too_short, skipped_ignored) + save_updated_files(updated_files) if __name__ == '__main__': main() diff --git a/.github/workflows/auto-add-seo.yml b/.github/workflows/auto-add-seo.yml index 993196cb86..b8829f869c 100644 --- a/.github/workflows/auto-add-seo.yml +++ b/.github/workflows/auto-add-seo.yml @@ -52,8 +52,14 @@ jobs: - name: Get changed markdown files from merged PR id: changed-files run: | - # Get files changed in the merged PR (only Added and Modified, exclude Deleted) - FILES=$(git diff --name-only --diff-filter=AM ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.merge_commit_sha }} | grep 'docs/en/.*\.md$' || true) + # Get the list of commits in the PR + PR_HEAD_SHA="${{ github.event.pull_request.head.sha }}" + PR_BASE_SHA="${{ github.event.pull_request.base.sha }}" + + echo "PR commits range: $PR_BASE_SHA..$PR_HEAD_SHA" + + # Get files changed in the PR commits only (only Added and Modified, exclude Deleted) + FILES=$(git diff --name-only --diff-filter=AM $PR_BASE_SHA..$PR_HEAD_SHA | grep 'docs/en/.*\.md$' || true) echo "Files changed in the merged PR (added/modified only):" echo "$FILES"