"""
Content Snippet Extractor
Extracts text paragraphs and images from competitor websites
Highlights keywords for content rewriting
"""

import json
import re
from competitor_analyzer import CompetitorAnalyzer
from datetime import datetime
from collections import defaultdict


class ContentExtractor:
    """Extracts paragraphs and images from competitor pages and renders
    keyword-highlighted rewriting reports (JSON + HTML)."""

    def __init__(self, config_file='config.json'):
        """Initialize the content extractor.

        Loads the JSON config, builds a CompetitorAnalyzer, then assembles
        self.keywords from (a) the first GSC keyword CSV that parses and
        (b) the competitor-keywords JSON report.  The list is de-duplicated
        while preserving insertion order, because other methods treat
        self.keywords[:50] as the "top" keyword set.

        :param config_file: path to the JSON configuration file
        """
        with open(config_file, 'r') as f:
            self.config = json.load(f)

        self.analyzer = CompetitorAnalyzer(config_file)

        # Keywords accumulate in priority order: GSC first, competitors second.
        self.keywords = []

        # Load GSC keywords from CSV (best effort: first readable file wins).
        try:
            import pandas as pd
            csv_files = [
                'reports/keyword_opportunities.csv',
                'reports/keyword_opportunities_20260109_011956.csv'
            ]
            for csv_file in csv_files:
                try:
                    df = pd.read_csv(csv_file)
                    gsc_keywords = df['query'].tolist()
                    self.keywords.extend(gsc_keywords)
                    print(f"[OK] Loaded {len(gsc_keywords)} keywords from {csv_file}")
                    break
                except Exception:
                    # Missing file or unexpected schema: try the next candidate.
                    # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
                    continue
        except Exception as e:
            print(f"[WARNING] Could not load GSC keywords: {e}")

        # Load competitor keywords from JSON (top 100 entries).
        try:
            with open('reports/competitor_keywords.json', 'r') as f:
                keyword_data = json.load(f)
                comp_keywords = [kw['keyword'] for kw in keyword_data['keywords'][:100]]
                self.keywords.extend(comp_keywords)
                print(f"[OK] Loaded {len(comp_keywords)} competitor keywords")
        except (OSError, json.JSONDecodeError, KeyError, TypeError):
            # Missing/malformed report file; proceed with whatever we have.
            print("[WARNING] No competitor_keywords.json found.")

        # De-duplicate while preserving order.  The original list(set(...))
        # shuffled the list arbitrarily, which made the "top 50" slice used
        # elsewhere nondeterministic between runs.
        self.keywords = list(dict.fromkeys(self.keywords))
        print(f"[OK] Total unique keywords: {len(self.keywords)}")
    
    def extract_content_from_competitors(self):
        """
        Scrape every configured competitor URL and collect its structured
        content (paragraphs, images, per-heading sections).

        Exports the combined results via export_content_snippets() and
        returns the list of structured-content dicts, one per site that
        could be scraped.
        """
        print("\n" + "="*60)
        print("CONTENT SNIPPET EXTRACTION")
        print("="*60)

        competitor_urls = self.config.get('competitors', [])
        all_content = []

        for idx, url in enumerate(competitor_urls, 1):
            print(f"\n[{idx}/{len(competitor_urls)}] Extracting from: {url}")

            page = self.analyzer.scrape_page_content(url)
            if not page:
                # Scrape failed; skip this competitor and keep going.
                continue

            paragraphs = self.extract_paragraphs(page)
            images = self.extract_images(url)

            # Organize everything under the page's H1/H2 headings.
            structured = self.structure_content(url, page, paragraphs, images)
            all_content.append(structured)

            print(f"  [OK] Extracted {len(paragraphs)} paragraphs, {len(images)} images")

        # Export results
        self.export_content_snippets(all_content)

        return all_content
    
    def extract_paragraphs(self, content):
        """
        Fetch content['url'] again and pull out substantial text blocks.

        Returns a list of dicts with 'text', 'word_count' and 'has_keywords'
        (matched keyword list); list items additionally carry
        'is_list_item': True.  Returns [] on any fetch/parse failure.
        """
        from bs4 import BeautifulSoup

        # Phrases that identify boilerplate (cookie banners, legal footers).
        boilerplate = ('cookie', 'privacy policy', 'terms and conditions', 'all rights reserved')

        # Re-scrape to get structured HTML
        try:
            response = self.analyzer.session.get(content['url'], timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Strip page chrome so only main-body text remains.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            snippets = []

            # Real paragraphs: every <p> long enough to matter.
            for p in soup.find_all('p'):
                text = p.get_text().strip()

                if len(text) < 50:
                    # Too short or empty to be a meaningful paragraph.
                    continue

                lowered = text.lower()
                if any(phrase in lowered for phrase in boilerplate):
                    # Navigation / legal boilerplate, not content.
                    continue

                snippets.append({
                    'text': text,
                    'word_count': len(text.split()),
                    'has_keywords': self.count_keywords_in_text(text)
                })

            # Substantial list items are worth keeping too.
            for lst in soup.find_all(['ul', 'ol']):
                for li in lst.find_all('li'):
                    text = li.get_text().strip()
                    if len(text) <= 30:
                        continue
                    snippets.append({
                        'text': text,
                        'word_count': len(text.split()),
                        'has_keywords': self.count_keywords_in_text(text),
                        'is_list_item': True
                    })

            return snippets

        except Exception as e:
            print(f"  [ERROR] Error extracting paragraphs: {e}")
            return []
    
    def extract_images(self, url):
        """
        Fetch *url* and return a list of image dicts with an absolute 'url'
        plus 'alt' and 'title' attributes.

        Images whose declared width/height are both present and under 100px
        are skipped as likely icons/logos.  Returns [] on any fetch/parse
        failure.
        """
        from bs4 import BeautifulSoup
        from urllib.parse import urljoin

        try:
            response = self.analyzer.session.get(url, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            images = []

            for img in soup.find_all('img'):
                # Lazy-loaded images often keep the real URL in data-src.
                src = img.get('src') or img.get('data-src')
                if not src:
                    continue

                # Convert relative URLs to absolute
                full_url = urljoin(url, src)

                # Skip tiny images (likely icons/logos).  Declared dimensions
                # are only advisory: non-numeric values ("auto", "100%") are
                # ignored and the image is kept.
                width = img.get('width')
                height = img.get('height')

                if width and height:
                    try:
                        if int(width) < 100 or int(height) < 100:
                            continue
                    except (TypeError, ValueError):
                        # Was a bare `except:` that also swallowed
                        # KeyboardInterrupt/SystemExit; only a failed int()
                        # conversion should be ignored here.
                        pass

                images.append({
                    'url': full_url,
                    'alt': img.get('alt', ''),
                    'title': img.get('title', '')
                })

            return images

        except Exception as e:
            print(f"  [ERROR] Error extracting images: {e}")
            return []
    
    def structure_content(self, url, page_content, paragraphs, images):
        """
        Organize scraped content into one section per H1/H2 heading.

        Paragraphs and images are distributed evenly across the sections,
        with the remainder assigned to the last section so nothing is
        dropped.  Pages without H1/H2 headings get a single 'Introduction'
        section holding everything.

        Bug fixed: the original guarded each section append on
        current_section['paragraphs'], which is always empty at that point
        (paragraphs are distributed *after* the heading loop), so the
        heading sections were never created and every page collapsed into
        a single section labelled with its last heading.  The even-split
        slicing also silently dropped remainder paragraphs/images.

        :param url: page URL (stored verbatim in the result)
        :param page_content: scrape result; must provide 'headings'
            (dicts with 'level'/'text') and 'title'
        :param paragraphs: paragraph dicts from extract_paragraphs()
        :param images: image dicts from extract_images()
        :return: dict with 'url', 'title', 'sections', 'total_paragraphs',
            'total_images'
        """
        # One empty section per top-level heading (H1 and H2 only).
        sections = [
            {
                'heading': heading['text'],
                'level': heading['level'],
                'paragraphs': [],
                'images': []
            }
            for heading in page_content['headings']
            if heading['level'] <= 2
        ]

        def spread(items, key):
            # Evenly slice *items* into the sections under *key*; the last
            # section absorbs any remainder so no item is lost.
            per_section = len(items) // len(sections)
            last = len(sections) - 1
            for i, section in enumerate(sections):
                start = i * per_section
                end = len(items) if i == last else start + per_section
                section[key] = items[start:end]

        if sections:
            spread(paragraphs, 'paragraphs')
            spread(images, 'images')
        else:
            # No usable headings: keep everything in one catch-all section.
            sections.append({
                'heading': 'Introduction',
                'level': 1,
                'paragraphs': paragraphs,
                'images': images
            })

        return {
            'url': url,
            'title': page_content['title'],
            'sections': sections,
            'total_paragraphs': len(paragraphs),
            'total_images': len(images)
        }
    
    def count_keywords_in_text(self, text):
        """
        Return the tracked keywords that occur in *text*.

        Only the first 50 entries of self.keywords are checked, using a
        case-insensitive substring match.
        """
        haystack = text.lower()
        return [kw for kw in self.keywords[:50] if kw.lower() in haystack]
    
    def highlight_keywords_in_text(self, text):
        """
        Return *text* with every tracked keyword wrapped in <mark> tags.

        Only the first 50 entries of self.keywords are considered.  The
        match is case-insensitive and the matched text keeps its original
        casing.

        Bugs fixed: the original ran one pattern.sub() per keyword, so
        later keywords re-matched inside already-inserted <mark> markup
        (e.g. a keyword that is a substring of another keyword, or the
        word "mark" itself, produced nested/broken tags).  It also
        replaced matches with the keyword's stored casing, silently
        rewriting the source text.  A single alternation pass with a
        match-preserving replacement fixes both.
        """
        # Longest alternatives first so the regex engine prefers the most
        # specific keyword at any given position.
        keywords = sorted(self.keywords[:50], key=len, reverse=True)
        if not keywords:
            return text

        pattern = re.compile(
            '|'.join(re.escape(keyword) for keyword in keywords),
            re.IGNORECASE,
        )
        # One pass over the text: inserted <mark> tags are never re-scanned.
        return pattern.sub(lambda m: f'<mark>{m.group(0)}</mark>', text)
    
    def export_content_snippets(self, all_content):
        """
        Export content snippets to HTML and JSON
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Export JSON (timestamped and static)
        json_path_ts = f"reports/content_snippets_{timestamp}.json"
        json_path_static = "reports/content_snippets.json"
        
        json_data = {
            'extraction_date': datetime.now().isoformat(),
            'total_sites': len(all_content),
            'total_paragraphs': sum(c['total_paragraphs'] for c in all_content),
            'total_images': sum(c['total_images'] for c in all_content),
            'content': all_content
        }
        
        with open(json_path_ts, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2)
        with open(json_path_static, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2)
        
        print(f"\n[OK] Saved JSON: {json_path_ts}")
        print(f"[OK] Saved JSON: {json_path_static}")
        
        # Export HTML (timestamped and static)
        html_path_ts = f"reports/content_snippets_{timestamp}.html"
        html_path_static = "reports/content_snippets.html"
        
        self.generate_content_html(all_content, html_path_ts)
        self.generate_content_html(all_content, html_path_static)
        
        print(f"[OK] Saved HTML: {html_path_ts}")
        print(f"[OK] Saved HTML: {html_path_static}")
        
        print(f"\n{'='*60}")
        print("[OK] CONTENT EXTRACTION COMPLETE!")
        print(f"{'='*60}")
        print(f"\nExtracted:")
        print(f"  - {sum(c['total_paragraphs'] for c in all_content)} paragraphs")
        print(f"  - {sum(c['total_images'] for c in all_content)} images")
        print(f"  - From {len(all_content)} competitor websites")
    
    def generate_content_html(self, all_content, output_path):
        """
        Generate HTML report with content snippets.

        Renders a fully self-contained page: summary metric cards, usage
        instructions, then one card per competitor site containing its
        sections, keyword-highlighted paragraphs (with copy-to-clipboard
        buttons) and image grids.  The finished document is written to
        *output_path* in UTF-8.

        NOTE(review): scraped text and attributes are interpolated into the
        markup without HTML escaping, so tags/quotes in competitor content
        can break the report layout — consider html.escape() before
        interpolation; confirm against real scraped data.
        """
        # Static page head: styles, the "how to use" instructions, and the
        # three summary metrics computed over all scraped sites.
        html = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Content Snippets for Rewriting</title>
            <style>
                body {{ font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }}
                .container {{ max-width: 1400px; margin: 0 auto; background: white; padding: 30px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }}
                h1 {{ color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }}
                h2 {{ color: #34495e; margin-top: 40px; background: #ecf0f1; padding: 15px; border-left: 4px solid #3498db; }}
                h3 {{ color: #7f8c8d; margin-top: 30px; }}
                .summary {{ display: flex; justify-content: space-around; margin: 30px 0; }}
                .metric {{ text-align: center; padding: 20px; background: #ecf0f1; border-radius: 8px; }}
                .metric-value {{ font-size: 36px; font-weight: bold; color: #3498db; }}
                .metric-label {{ color: #7f8c8d; margin-top: 5px; }}
                .source {{ background: #fff; border: 2px solid #3498db; padding: 20px; margin: 20px 0; border-radius: 8px; }}
                .source-url {{ color: #3498db; font-size: 14px; word-break: break-all; }}
                .section {{ background: #f9f9f9; padding: 20px; margin: 15px 0; border-radius: 5px; border-left: 3px solid #27ae60; }}
                .paragraph {{ background: white; padding: 15px; margin: 10px 0; border-radius: 5px; border: 1px solid #ddd; line-height: 1.8; }}
                .paragraph:hover {{ box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
                mark {{ background: #fff3cd; padding: 2px 4px; border-radius: 3px; font-weight: bold; }}
                .keyword-count {{ color: #27ae60; font-weight: bold; font-size: 12px; }}
                .word-count {{ color: #7f8c8d; font-size: 12px; }}
                .images {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); gap: 15px; margin: 20px 0; }}
                .image-card {{ background: white; padding: 10px; border: 1px solid #ddd; border-radius: 5px; }}
                .image-card img {{ width: 100%; height: 150px; object-fit: cover; border-radius: 3px; }}
                .image-url {{ font-size: 11px; color: #7f8c8d; word-break: break-all; margin-top: 5px; }}
                .copy-btn {{ background: #3498db; color: white; border: none; padding: 8px 15px; border-radius: 5px; cursor: pointer; margin-top: 10px; }}
                .copy-btn:hover {{ background: #2980b9; }}
                .list-item {{ border-left: 3px solid #f39c12; padding-left: 15px; }}
                .instructions {{ background: #d5f4e6; border-left: 4px solid #27ae60; padding: 20px; margin: 20px 0; }}
            </style>
        </head>
        <body>
            <div class="container">
                <h1>📝 Content Snippets for Rewriting</h1>
                <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                
                <div class="instructions">
                    <h3>How to Use This Report</h3>
                    <ol>
                        <li><strong>Keywords are highlighted</strong> in yellow throughout the text</li>
                        <li><strong>Read each snippet</strong> and understand the competitor's approach</li>
                        <li><strong>Rewrite in your own words</strong> - don't copy! Use as inspiration</li>
                        <li><strong>Keep the keywords</strong> but change the sentence structure and add your unique value</li>
                        <li><strong>Use images</strong> as reference for what visuals to create/source</li>
                    </ol>
                </div>
                
                <div class="summary">
                    <div class="metric">
                        <div class="metric-value">{sum(c['total_paragraphs'] for c in all_content)}</div>
                        <div class="metric-label">Total Paragraphs</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value">{sum(c['total_images'] for c in all_content)}</div>
                        <div class="metric-label">Total Images</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value">{len(all_content)}</div>
                        <div class="metric-label">Competitor Sites</div>
                    </div>
                </div>
        """
        
        # Add content from each competitor: one .source card per site.
        for site in all_content:
            html += f"""
                <div class="source">
                    <h2>{site['title']}</h2>
                    <p class="source-url">Source: <a href="{site['url']}" target="_blank">{site['url']}</a></p>
                    <p><strong>{site['total_paragraphs']} paragraphs</strong> | <strong>{site['total_images']} images</strong></p>
            """
            
            # Add sections; sections without paragraphs are omitted entirely.
            for section in site['sections']:
                if not section['paragraphs']:
                    continue
                
                html += f"""
                    <div class="section">
                        <h3>📌 {section['heading']}</h3>
                """
                
                # Add paragraphs.  The hidden <textarea> holds the un-highlighted
                # plain text that the copyText() script copies to the clipboard.
                for i, para in enumerate(section['paragraphs'], 1):
                    highlighted_text = self.highlight_keywords_in_text(para['text'])
                    keyword_count = len(para['has_keywords'])
                    is_list = para.get('is_list_item', False)
                    list_class = ' list-item' if is_list else ''
                    
                    html += f"""
                        <div class="paragraph{list_class}">
                            <div>
                                <span class="keyword-count">{keyword_count} keywords</span> | 
                                <span class="word-count">{para['word_count']} words</span>
                            </div>
                            <p>{highlighted_text}</p>
                            <button class="copy-btn" onclick="copyText(this)">Copy Plain Text</button>
                            <textarea style="display:none;">{para['text']}</textarea>
                        </div>
                    """
                
                # Add images for this section.  The inline onerror handler swaps
                # in a data-URI SVG placeholder when the remote image 404s.
                if section['images']:
                    html += '<h4>Images in this section:</h4><div class="images">'
                    for img in section['images']:
                        html += f"""
                            <div class="image-card">
                                <img src="{img['url']}" alt="{img['alt']}" onerror="this.src='data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%22200%22 height=%22150%22%3E%3Crect fill=%22%23ddd%22 width=%22200%22 height=%22150%22/%3E%3Ctext x=%2250%25%22 y=%2250%25%22 text-anchor=%22middle%22 fill=%22%23999%22%3EImage not available%3C/text%3E%3C/svg%3E'">
                                <div class="image-url">{img['alt'] or 'No alt text'}</div>
                                <button class="copy-btn" onclick="copyText(this)">Copy URL</button>
                                <textarea style="display:none;">{img['url']}</textarea>
                            </div>
                        """
                    html += '</div>'
                
                html += '</div>'  # Close section
            
            html += '</div>'  # Close source
        
        # Shared copy-to-clipboard script: briefly un-hides the sibling
        # <textarea>, selects it, copies, then restores the button label.
        html += """
                <script>
                    function copyText(button) {
                        const textarea = button.nextElementSibling;
                        textarea.style.display = 'block';
                        textarea.select();
                        document.execCommand('copy');
                        textarea.style.display = 'none';
                        
                        button.textContent = 'Copied!';
                        setTimeout(() => {
                            button.textContent = button.previousElementSibling ? 'Copy URL' : 'Copy Plain Text';
                        }, 2000);
                    }
                </script>
            </div>
        </body>
        </html>
        """
        
        # Write the finished document (UTF-8 so the emoji headings survive).
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)


def main():
    """Command-line entry point: run the extraction pipeline and point the
    user at the generated report."""
    print("\n" + "="*60)
    print("CONTENT SNIPPET EXTRACTOR")
    print("="*60)
    print("\nThis tool extracts paragraphs and images from competitor websites")
    print("with keywords highlighted for easy content rewriting.\n")

    extractor = ContentExtractor()
    # Return value is also persisted to reports/ by the extractor itself.
    extractor.extract_content_from_competitors()

    print("\n[REPORTS] Open reports/content_snippets.html to see all content!")
    print("\nUse the snippets as inspiration to write your own unique content.")
    print("Keywords are highlighted to help you maintain SEO focus.")


if __name__ == "__main__":
    main()
