"""
Content Snippet Extractor
Extracts text paragraphs and images from competitor websites
Highlights keywords for content rewriting
"""

import json
import re
from competitor_analyzer import CompetitorAnalyzer
from datetime import datetime
from collections import defaultdict


class ContentExtractor:
    """Extracts paragraphs and images from competitor pages and renders
    keyword-highlighted rewriting reports (JSON + HTML)."""

    def __init__(self, config_file='config.json'):
        """Initialize the content extractor.

        Loads the JSON config, builds a CompetitorAnalyzer, then assembles
        self.keywords from (a) the first GSC keyword CSV that parses and
        (b) the competitor-keywords JSON report.  The list is de-duplicated
        while preserving insertion order, because other methods treat
        self.keywords[:50] as the "top" keyword set.

        :param config_file: path to the JSON configuration file
        """
        with open(config_file, 'r') as f:
            self.config = json.load(f)

        self.analyzer = CompetitorAnalyzer(config_file)

        # Keywords accumulate in priority order: GSC first, competitors second.
        self.keywords = []

        # Load GSC keywords from CSV (best effort: first readable file wins).
        try:
            import pandas as pd
            csv_files = [
                'reports/keyword_opportunities.csv',
                'reports/keyword_opportunities_20260109_011956.csv'
            ]
            for csv_file in csv_files:
                try:
                    df = pd.read_csv(csv_file)
                    gsc_keywords = df['query'].tolist()
                    self.keywords.extend(gsc_keywords)
                    print(f"[OK] Loaded {len(gsc_keywords)} keywords from {csv_file}")
                    break
                except Exception:
                    # Missing file or unexpected schema: try the next candidate.
                    # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
                    continue
        except Exception as e:
            print(f"[WARNING] Could not load GSC keywords: {e}")

        # Load competitor keywords from JSON (top 100 entries).
        try:
            with open('reports/competitor_keywords.json', 'r') as f:
                keyword_data = json.load(f)
                comp_keywords = [kw['keyword'] for kw in keyword_data['keywords'][:100]]
                self.keywords.extend(comp_keywords)
                print(f"[OK] Loaded {len(comp_keywords)} competitor keywords")
        except (OSError, json.JSONDecodeError, KeyError, TypeError):
            # Missing/malformed report file; proceed with whatever we have.
            print("[WARNING] No competitor_keywords.json found.")

        # De-duplicate while preserving order.  The original list(set(...))
        # shuffled the list arbitrarily, which made the "top 50" slice used
        # elsewhere nondeterministic between runs.
        self.keywords = list(dict.fromkeys(self.keywords))
        print(f"[OK] Total unique keywords: {len(self.keywords)}")
    
    def extract_content_from_competitors(self):
        """
        Scrape every configured competitor URL and collect its structured
        content (paragraphs, images, per-heading sections).

        Exports the combined results via export_content_snippets() and
        returns the list of structured-content dicts, one per site that
        could be scraped.
        """
        print("\n" + "="*60)
        print("CONTENT SNIPPET EXTRACTION")
        print("="*60)

        competitor_urls = self.config.get('competitors', [])
        all_content = []

        for idx, url in enumerate(competitor_urls, 1):
            print(f"\n[{idx}/{len(competitor_urls)}] Extracting from: {url}")

            page = self.analyzer.scrape_page_content(url)
            if not page:
                # Scrape failed; skip this competitor and keep going.
                continue

            paragraphs = self.extract_paragraphs(page)
            images = self.extract_images(url)

            # Organize everything under the page's H1/H2 headings.
            structured = self.structure_content(url, page, paragraphs, images)
            all_content.append(structured)

            print(f"  [OK] Extracted {len(paragraphs)} paragraphs, {len(images)} images")

        # Export results
        self.export_content_snippets(all_content)

        return all_content
    
    def extract_paragraphs(self, content):
        """
        Fetch content['url'] again and pull out substantial text blocks.

        Returns a list of dicts with 'text', 'word_count' and 'has_keywords'
        (matched keyword list); list items additionally carry
        'is_list_item': True.  Returns [] on any fetch/parse failure.
        """
        from bs4 import BeautifulSoup

        # Phrases that identify boilerplate (cookie banners, legal footers).
        boilerplate = ('cookie', 'privacy policy', 'terms and conditions', 'all rights reserved')

        # Re-scrape to get structured HTML
        try:
            response = self.analyzer.session.get(content['url'], timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Strip page chrome so only main-body text remains.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            snippets = []

            # Real paragraphs: every <p> long enough to matter.
            for p in soup.find_all('p'):
                text = p.get_text().strip()

                if len(text) < 50:
                    # Too short or empty to be a meaningful paragraph.
                    continue

                lowered = text.lower()
                if any(phrase in lowered for phrase in boilerplate):
                    # Navigation / legal boilerplate, not content.
                    continue

                snippets.append({
                    'text': text,
                    'word_count': len(text.split()),
                    'has_keywords': self.count_keywords_in_text(text)
                })

            # Substantial list items are worth keeping too.
            for lst in soup.find_all(['ul', 'ol']):
                for li in lst.find_all('li'):
                    text = li.get_text().strip()
                    if len(text) <= 30:
                        continue
                    snippets.append({
                        'text': text,
                        'word_count': len(text.split()),
                        'has_keywords': self.count_keywords_in_text(text),
                        'is_list_item': True
                    })

            return snippets

        except Exception as e:
            print(f"  [ERROR] Error extracting paragraphs: {e}")
            return []
    
    def extract_images(self, url):
        """
        Fetch *url* and return a list of image dicts with an absolute 'url'
        plus 'alt' and 'title' attributes.

        Images whose declared width/height are both present and under 100px
        are skipped as likely icons/logos.  Returns [] on any fetch/parse
        failure.
        """
        from bs4 import BeautifulSoup
        from urllib.parse import urljoin

        try:
            response = self.analyzer.session.get(url, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            images = []

            for img in soup.find_all('img'):
                # Lazy-loaded images often keep the real URL in data-src.
                src = img.get('src') or img.get('data-src')
                if not src:
                    continue

                # Convert relative URLs to absolute
                full_url = urljoin(url, src)

                # Skip tiny images (likely icons/logos).  Declared dimensions
                # are only advisory: non-numeric values ("auto", "100%") are
                # ignored and the image is kept.
                width = img.get('width')
                height = img.get('height')

                if width and height:
                    try:
                        if int(width) < 100 or int(height) < 100:
                            continue
                    except (TypeError, ValueError):
                        # Was a bare `except:` that also swallowed
                        # KeyboardInterrupt/SystemExit; only a failed int()
                        # conversion should be ignored here.
                        pass

                images.append({
                    'url': full_url,
                    'alt': img.get('alt', ''),
                    'title': img.get('title', '')
                })

            return images

        except Exception as e:
            print(f"  [ERROR] Error extracting images: {e}")
            return []
    
    def structure_content(self, url, page_content, paragraphs, images):
        """
        Organize scraped content into one section per H1/H2 heading.

        Paragraphs and images are distributed evenly across the sections,
        with the remainder assigned to the last section so nothing is
        dropped.  Pages without H1/H2 headings get a single 'Introduction'
        section holding everything.

        Bug fixed: the original guarded each section append on
        current_section['paragraphs'], which is always empty at that point
        (paragraphs are distributed *after* the heading loop), so the
        heading sections were never created and every page collapsed into
        a single section labelled with its last heading.  The even-split
        slicing also silently dropped remainder paragraphs/images.

        :param url: page URL (stored verbatim in the result)
        :param page_content: scrape result; must provide 'headings'
            (dicts with 'level'/'text') and 'title'
        :param paragraphs: paragraph dicts from extract_paragraphs()
        :param images: image dicts from extract_images()
        :return: dict with 'url', 'title', 'sections', 'total_paragraphs',
            'total_images'
        """
        # One empty section per top-level heading (H1 and H2 only).
        sections = [
            {
                'heading': heading['text'],
                'level': heading['level'],
                'paragraphs': [],
                'images': []
            }
            for heading in page_content['headings']
            if heading['level'] <= 2
        ]

        def spread(items, key):
            # Evenly slice *items* into the sections under *key*; the last
            # section absorbs any remainder so no item is lost.
            per_section = len(items) // len(sections)
            last = len(sections) - 1
            for i, section in enumerate(sections):
                start = i * per_section
                end = len(items) if i == last else start + per_section
                section[key] = items[start:end]

        if sections:
            spread(paragraphs, 'paragraphs')
            spread(images, 'images')
        else:
            # No usable headings: keep everything in one catch-all section.
            sections.append({
                'heading': 'Introduction',
                'level': 1,
                'paragraphs': paragraphs,
                'images': images
            })

        return {
            'url': url,
            'title': page_content['title'],
            'sections': sections,
            'total_paragraphs': len(paragraphs),
            'total_images': len(images)
        }
    
    def count_keywords_in_text(self, text):
        """
        Return the tracked keywords that occur in *text*.

        Only the first 50 entries of self.keywords are checked, using a
        case-insensitive substring match.
        """
        haystack = text.lower()
        return [kw for kw in self.keywords[:50] if kw.lower() in haystack]
    
    def highlight_keywords_in_text(self, text):
        """
        Return *text* with every tracked keyword wrapped in <mark> tags.

        Only the first 50 entries of self.keywords are considered.  The
        match is case-insensitive and the matched text keeps its original
        casing.

        Bugs fixed: the original ran one pattern.sub() per keyword, so
        later keywords re-matched inside already-inserted <mark> markup
        (e.g. a keyword that is a substring of another keyword, or the
        word "mark" itself, produced nested/broken tags).  It also
        replaced matches with the keyword's stored casing, silently
        rewriting the source text.  A single alternation pass with a
        match-preserving replacement fixes both.
        """
        # Longest alternatives first so the regex engine prefers the most
        # specific keyword at any given position.
        keywords = sorted(self.keywords[:50], key=len, reverse=True)
        if not keywords:
            return text

        pattern = re.compile(
            '|'.join(re.escape(keyword) for keyword in keywords),
            re.IGNORECASE,
        )
        # One pass over the text: inserted <mark> tags are never re-scanned.
        return pattern.sub(lambda m: f'<mark>{m.group(0)}</mark>', text)
    
    def export_content_snippets(self, all_content):
        """
        Export content snippets to HTML and JSON
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Export JSON (timestamped and static)
        json_path_ts = f"reports/content_snippets_{timestamp}.json"
        json_path_static = "reports/content_snippets.json"
        
        json_data = {
            'extraction_date': datetime.now().isoformat(),
            'total_sites': len(all_content),
            'total_paragraphs': sum(c['total_paragraphs'] for c in all_content),
            'total_images': sum(c['total_images'] for c in all_content),
            'content': all_content
        }
        
        with open(json_path_ts, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2)
        with open(json_path_static, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2)
        
        print(f"\n[OK] Saved JSON: {json_path_ts}")
        print(f"[OK] Saved JSON: {json_path_static}")
        
        # Export HTML (timestamped and static)
        html_path_ts = f"reports/content_snippets_{timestamp}.html"
        html_path_static = "reports/content_snippets.html"
        
        self.generate_content_html(all_content, html_path_ts)
        self.generate_content_html(all_content, html_path_static)
        
        print(f"[OK] Saved HTML: {html_path_ts}")
        print(f"[OK] Saved HTML: {html_path_static}")
        
        print(f"\n{'='*60}")
        print("[OK] CONTENT EXTRACTION COMPLETE!")
        print(f"{'='*60}")
        print(f"\nExtracted:")
        print(f"  - {sum(c['total_paragraphs'] for c in all_content)} paragraphs")
        print(f"  - {sum(c['total_images'] for c in all_content)} images")
        print(f"  - From {len(all_content)} competitor websites")
    
    def generate_content_html(self, all_content, output_path):
        """
        Generate HTML report with content snippets.

        Renders a fully self-contained page: summary metric cards, usage
        instructions, then one card per competitor site containing its
        sections, keyword-highlighted paragraphs (with copy-to-clipboard
        buttons) and image grids.  The finished document is written to
        *output_path* in UTF-8.

        NOTE(review): scraped text and attributes are interpolated into the
        markup without HTML escaping, so tags/quotes in competitor content
        can break the report layout — consider html.escape() before
        interpolation; confirm against real scraped data.
        """
        # Static page head: styles, the "how to use" instructions, and the
        # three summary metrics computed over all scraped sites.
        html = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Content Snippets for Rewriting</title>
            <style>
                body {{ font-family: 'Segoe UI', Arial, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }}
                .container {{ max-width: 1400px; margin: 0 auto; background: white; padding: 30px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }}
                h1 {{ color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }}
                h2 {{ color: #34495e; margin-top: 40px; background: #ecf0f1; padding: 15px; border-left: 4px solid #3498db; }}
                h3 {{ color: #7f8c8d; margin-top: 30px; }}
                .summary {{ display: flex; justify-content: space-around; margin: 30px 0; }}
                .metric {{ text-align: center; padding: 20px; background: #ecf0f1; border-radius: 8px; }}
                .metric-value {{ font-size: 36px; font-weight: bold; color: #3498db; }}
                .metric-label {{ color: #7f8c8d; margin-top: 5px; }}
                .source {{ background: #fff; border: 2px solid #3498db; padding: 20px; margin: 20px 0; border-radius: 8px; }}
                .source-url {{ color: #3498db; font-size: 14px; word-break: break-all; }}
                .section {{ background: #f9f9f9; padding: 20px; margin: 15px 0; border-radius: 5px; border-left: 3px solid #27ae60; }}
                .paragraph {{ background: white; padding: 15px; margin: 10px 0; border-radius: 5px; border: 1px solid #ddd; line-height: 1.8; }}
                .paragraph:hover {{ box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
                mark {{ background: #fff3cd; padding: 2px 4px; border-radius: 3px; font-weight: bold; }}
                .keyword-count {{ color: #27ae60; font-weight: bold; font-size: 12px; }}
                .word-count {{ color: #7f8c8d; font-size: 12px; }}
                .images {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); gap: 15px; margin: 20px 0; }}
                .image-card {{ background: white; padding: 10px; border: 1px solid #ddd; border-radius: 5px; }}
                .image-card img {{ width: 100%; height: 150px; object-fit: cover; border-radius: 3px; }}
                .image-url {{ font-size: 11px; color: #7f8c8d; word-break: break-all; margin-top: 5px; }}
                .copy-btn {{ background: #3498db; color: white; border: none; padding: 8px 15px; border-radius: 5px; cursor: pointer; margin-top: 10px; }}
                .copy-btn:hover {{ background: #2980b9; }}
                .list-item {{ border-left: 3px solid #f39c12; padding-left: 15px; }}
                .instructions {{ background: #d5f4e6; border-left: 4px solid #27ae60; padding: 20px; margin: 20px 0; }}
            </style>
        </head>
        <body>
            <div class="container">
                <h1>📝 Content Snippets for Rewriting</h1>
                <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                
                <div class="instructions">
                    <h3>How to Use This Report</h3>
                    <ol>
                        <li><strong>Keywords are highlighted</strong> in yellow throughout the text</li>
                        <li><strong>Read each snippet</strong> and understand the competitor's approach</li>
                        <li><strong>Rewrite in your own words</strong> - don't copy! Use as inspiration</li>
                        <li><strong>Keep the keywords</strong> but change the sentence structure and add your unique value</li>
                        <li><strong>Use images</strong> as reference for what visuals to create/source</li>
                    </ol>
                </div>
                
                <div class="summary">
                    <div class="metric">
                        <div class="metric-value">{sum(c['total_paragraphs'] for c in all_content)}</div>
                        <div class="metric-label">Total Paragraphs</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value">{sum(c['total_images'] for c in all_content)}</div>
                        <div class="metric-label">Total Images</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value">{len(all_content)}</div>
                        <div class="metric-label">Competitor Sites</div>
                    </div>
                </div>
        """
        
        # Add content from each competitor: one .source card per site.
        for site in all_content:
            html += f"""
                <div class="source">
                    <h2>{site['title']}</h2>
                    <p class="source-url">Source: <a href="{site['url']}" target="_blank">{site['url']}</a></p>
                    <p><strong>{site['total_paragraphs']} paragraphs</strong> | <strong>{site['total_images']} images</strong></p>
            """
            
            # Add sections; sections without paragraphs are omitted entirely.
            for section in site['sections']:
                if not section['paragraphs']:
                    continue
                
                html += f"""
                    <div class="section">
                        <h3>📌 {section['heading']}</h3>
                """
                
                # Add paragraphs.  The hidden <textarea> holds the un-highlighted
                # plain text that the copyText() script copies to the clipboard.
                for i, para in enumerate(section['paragraphs'], 1):
                    highlighted_text = self.highlight_keywords_in_text(para['text'])
                    keyword_count = len(para['has_keywords'])
                    is_list = para.get('is_list_item', False)
                    list_class = ' list-item' if is_list else ''
                    
                    html += f"""
                        <div class="paragraph{list_class}">
                            <div>
                                <span class="keyword-count">{keyword_count} keywords</span> | 
                                <span class="word-count">{para['word_count']} words</span>
                            </div>
                            <p>{highlighted_text}</p>
                            <button class="copy-btn" onclick="copyText(this)">Copy Plain Text</button>
                            <textarea style="display:none;">{para['text']}</textarea>
                        </div>
                    """
                
                # Add images for this section.  The inline onerror handler swaps
                # in a data-URI SVG placeholder when the remote image 404s.
                if section['images']:
                    html += '<h4>Images in this section:</h4><div class="images">'
                    for img in section['images']:
                        html += f"""
                            <div class="image-card">
                                <img src="{img['url']}" alt="{img['alt']}" onerror="this.src='data:image/svg+xml,%3Csvg xmlns=%22http://www.w3.org/2000/svg%22 width=%22200%22 height=%22150%22%3E%3Crect fill=%22%23ddd%22 width=%22200%22 height=%22150%22/%3E%3Ctext x=%2250%25%22 y=%2250%25%22 text-anchor=%22middle%22 fill=%22%23999%22%3EImage not available%3C/text%3E%3C/svg%3E'">
                                <div class="image-url">{img['alt'] or 'No alt text'}</div>
                                <button class="copy-btn" onclick="copyText(this)">Copy URL</button>
                                <textarea style="display:none;">{img['url']}</textarea>
                            </div>
                        """
                    html += '</div>'
                
                html += '</div>'  # Close section
            
            html += '</div>'  # Close source
        
        # Shared copy-to-clipboard script: briefly un-hides the sibling
        # <textarea>, selects it, copies, then restores the button label.
        html += """
                <script>
                    function copyText(button) {
                        const textarea = button.nextElementSibling;
                        textarea.style.display = 'block';
                        textarea.select();
                        document.execCommand('copy');
                        textarea.style.display = 'none';
                        
                        button.textContent = 'Copied!';
                        setTimeout(() => {
                            button.textContent = button.previousElementSibling ? 'Copy URL' : 'Copy Plain Text';
                        }, 2000);
                    }
                </script>
            </div>
        </body>
        </html>
        """
        
        # Write the finished document (UTF-8 so the emoji headings survive).
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)


def main():
    """Command-line entry point: run the extraction pipeline and point the
    user at the generated report."""
    print("\n" + "="*60)
    print("CONTENT SNIPPET EXTRACTOR")
    print("="*60)
    print("\nThis tool extracts paragraphs and images from competitor websites")
    print("with keywords highlighted for easy content rewriting.\n")

    extractor = ContentExtractor()
    # Return value is also persisted to reports/ by the extractor itself.
    extractor.extract_content_from_competitors()

    print("\n[REPORTS] Open reports/content_snippets.html to see all content!")
    print("\nUse the snippets as inspiration to write your own unique content.")
    print("Keywords are highlighted to help you maintain SEO focus.")


if __name__ == "__main__":
    main()
