"""
Competitor Keyword Extractor
Scrapes competitor websites and extracts recurring keyword phrases, ranked by a simple relevance score
"""

import json
import os
import re
import time
from collections import Counter
from datetime import datetime

import pandas as pd

from competitor_analyzer import CompetitorAnalyzer


def extract_keywords_from_competitors(config_file='config.json'):
    """
    Scrape all competitor URLs and extract keywords
    """
    print("\n" + "="*60)
    print("COMPETITOR KEYWORD EXTRACTION")
    print("="*60)
    
    with open(config_file, 'r') as f:
        config = json.load(f)
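
    # A minimal sketch of the config.json shape this script assumes, inferred from
    # the keys read below (the real file may contain additional settings; the URLs
    # here are hypothetical placeholders):
    # {
    #     "competitors": ["https://competitor-one.example", "https://competitor-two.example"],
    #     "analysis_settings": {
    #         "scrape_internal_links": true,
    #         "max_internal_pages": 5
    #     }
    # }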
    
    analyzer = CompetitorAnalyzer(config_file)
    competitor_urls = config.get('competitors', [])
    
    print(f"\nFound {len(competitor_urls)} competitor URLs to analyze")
    
    all_keywords = []
    all_pages_data = []
    
    # Scrape each competitor
    for i, url in enumerate(competitor_urls, 1):
        print(f"\n[{i}/{len(competitor_urls)}] Analyzing: {url}")
        
        # Scrape main page
        content = analyzer.scrape_page_content(url)
        
        if content:
            all_pages_data.append(content)
            
            # Extract keywords from this page
            keywords = extract_keywords_from_text(content['body_text'])
            all_keywords.extend(keywords)
            
            print(f"  [OK] Extracted {len(keywords)} keyword phrases")
            
            # Scrape internal pages if enabled
            settings = config.get('analysis_settings', {})
            if settings.get('scrape_internal_links', False):
                max_pages = settings.get('max_internal_pages', 5)
                print(f"  Scraping up to {max_pages} internal pages...")
                
                internal_pages = analyzer.scrape_internal_pages(url, max_pages=max_pages)
                
                for page in internal_pages:
                    keywords = extract_keywords_from_text(page['body_text'])
                    all_keywords.extend(keywords)
                
                all_pages_data.extend(internal_pages)
                print(f"  [OK] Scraped {len(internal_pages)} internal pages")
        
        time.sleep(1)  # Be polite
    
    # Count keyword frequency
    print("\n" + "="*60)
    print("ANALYZING KEYWORD FREQUENCY")
    print("="*60)
    
    keyword_counts = Counter(all_keywords)
    
    # Filter and rank keywords
    filtered_keywords = []
    
    for keyword, count in keyword_counts.most_common(500):
        # Skip phrases outside the 2-5 word range produced by the extractor
        word_count = len(keyword.split())
        if word_count < 2 or word_count > 5:
            continue
        
        # Skip if too rare
        if count < 2:
            continue
        
        # Calculate a relevance score
        # More mentions = higher score
        # Longer phrases = slightly higher score (more specific)
        score = count * (1 + (word_count * 0.1))
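        # Worked example: a 3-word phrase mentioned 10 times scores 10 * (1 + 3 * 0.1) = 13.0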
        
        filtered_keywords.append({
            'keyword': keyword,
            'frequency': count,
            'word_count': word_count,
            'relevance_score': round(score, 2)
        })
    
    # Sort by relevance score
    filtered_keywords.sort(key=lambda x: x['relevance_score'], reverse=True)
    
    print(f"\n[OK] Found {len(filtered_keywords)} relevant keywords")
    
    # Export results
    export_keyword_results(filtered_keywords, all_pages_data)
    
    return filtered_keywords


def extract_keywords_from_text(text):
    """
    Extract candidate keyword phrases (2-5 word n-grams) from raw page text
    """
    # Clean and lowercase
    text = text.lower()
    
    # Remove special characters but keep spaces and hyphens
    text = re.sub(r'[^a-z0-9\s\-]', ' ', text)
    
    # Split into words
    words = text.split()
    
    # Common stop words to filter
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
        'before', 'after', 'above', 'below', 'between', 'under', 'again',
        'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
        'how', 'all', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
        'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
        'very', 'can', 'will', 'just', 'should', 'now', 'this', 'that', 'these',
        'those', 'are', 'was', 'were', 'been', 'being', 'have', 'has', 'had',
        'do', 'does', 'did', 'doing', 'would', 'could', 'ought', 'i', 'you',
        'he', 'she', 'it', 'we', 'they', 'them', 'their', 'what', 'which',
        'who', 'whom', 'whose', 'if', 'because', 'as', 'until', 'while', 'our',
        'your', 'his', 'her', 'its', 'my', 'me', 'him', 'us', 'out', 'over',
        'get', 'got', 'make', 'made', 'go', 'went', 'come', 'came', 'see',
        'saw', 'know', 'knew', 'take', 'took', 'give', 'gave', 'find', 'found',
        'think', 'thought', 'tell', 'told', 'become', 'became', 'leave', 'left',
        'feel', 'felt', 'put', 'bring', 'brought', 'begin', 'began', 'keep',
        'kept', 'hold', 'held', 'write', 'wrote', 'stand', 'stood', 'hear',
        'heard', 'let', 'mean', 'meant', 'set', 'meet', 'met', 'run', 'ran',
        'pay', 'paid', 'sit', 'sat', 'speak', 'spoke', 'lie', 'lay', 'lead',
        'led', 'read', 'grow', 'grew', 'lose', 'lost', 'fall', 'fell', 'send',
        'sent', 'build', 'built', 'understand', 'understood', 'draw', 'drew',
        'break', 'broke', 'spend', 'spent', 'cut', 'choose', 'chose', 'buy',
        'bought', 'wear', 'wore', 'catch', 'caught', 'teach', 'taught'
    }
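    # Note: this is a hand-rolled stop-word list; a library list such as
    # nltk.corpus.stopwords could be swapped in if broader coverage is needed.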
    
    keywords = []
    
    # Extract 2-5 word phrases
    for phrase_length in range(2, 6):
        for i in range(len(words) - phrase_length + 1):
            phrase_words = words[i:i+phrase_length]
            
            # Skip if starts or ends with stop word
            if phrase_words[0] in stop_words or phrase_words[-1] in stop_words:
                continue
            
            # Skip if all words are stop words
            if all(w in stop_words for w in phrase_words):
                continue
            
            phrase = ' '.join(phrase_words)
            
            # Skip very short phrases
            if len(phrase) < 8:
                continue
            
            keywords.append(phrase)
    
    return keywords


def export_keyword_results(keywords, pages_data):
    """
    Export keyword results to multiple formats
    """
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Make sure the output directory exists before writing any files
    os.makedirs('reports', exist_ok=True)
    
    # Export CSV (both versions)
    df = pd.DataFrame(keywords)
    csv_path_ts = f"reports/competitor_keywords_{timestamp}.csv"
    csv_path_static = "reports/competitor_keywords.csv"
    df.to_csv(csv_path_ts, index=False)
    df.to_csv(csv_path_static, index=False)
    print(f"\n[OK] Saved CSV: {csv_path_ts}")
    print(f"[OK] Saved CSV: {csv_path_static}")
    
    # Export JSON (both versions)
    json_data = {
        'total_keywords': len(keywords),
        'total_pages_analyzed': len(pages_data),
        'keywords': keywords[:200]  # Top 200
    }
    json_path_ts = f"reports/competitor_keywords_{timestamp}.json"
    json_path_static = "reports/competitor_keywords.json"
    with open(json_path_ts, 'w') as f:
        json.dump(json_data, f, indent=2)
    with open(json_path_static, 'w') as f:
        json.dump(json_data, f, indent=2)
    print(f"[OK] Saved JSON: {json_path_ts}")
    print(f"[OK] Saved JSON: {json_path_static}")
    
    # Export HTML Report (both versions)
    html_path_ts = f"reports/competitor_keywords_{timestamp}.html"
    html_path_static = "reports/competitor_keywords.html"
    generate_keyword_html(keywords, pages_data, html_path_ts)
    generate_keyword_html(keywords, pages_data, html_path_static)
    print(f"[OK] Saved HTML: {html_path_ts}")
    print(f"[OK] Saved HTML: {html_path_static}")
    
    print(f"\n{'='*60}")
    print("[OK] EXTRACTION COMPLETE!")
    print(f"{'='*60}")
    print(f"\nTop 10 Keywords by Relevance:")
    for i, kw in enumerate(keywords[:10], 1):
        print(f"  {i}. {kw['keyword']} (mentioned {kw['frequency']}x, score: {kw['relevance_score']})")


def generate_keyword_html(keywords, pages_data, output_path):
    """
    Generate HTML report for competitor keywords
    """
    html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Competitor Keywords Report</title>
        <style>
            body {{ font-family: 'Segoe UI', Arial, sans-serif; margin: 20px; background: #f5f5f5; }}
            .container {{ max-width: 1200px; margin: 0 auto; background: white; padding: 30px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }}
            h1 {{ color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px; }}
            h2 {{ color: #34495e; margin-top: 40px; }}
            .summary {{ display: flex; justify-content: space-around; margin: 30px 0; }}
            .metric {{ text-align: center; padding: 20px; background: #ecf0f1; border-radius: 8px; }}
            .metric-value {{ font-size: 36px; font-weight: bold; color: #3498db; }}
            .metric-label {{ color: #7f8c8d; margin-top: 5px; }}
            table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
            th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
            th {{ background-color: #3498db; color: white; position: sticky; top: 0; }}
            tr:nth-child(even) {{ background-color: #f9f9f9; }}
            tr:hover {{ background-color: #e8f4f8; }}
            .keyword {{ font-weight: bold; color: #2c3e50; }}
            .high-score {{ color: #27ae60; }}
            .med-score {{ color: #f39c12; }}
            .low-score {{ color: #95a5a6; }}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>🔍 Competitor Keywords Analysis</h1>
            <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            
            <div class="summary">
                <div class="metric">
                    <div class="metric-value">{len(keywords)}</div>
                    <div class="metric-label">Total Keywords</div>
                </div>
                <div class="metric">
                    <div class="metric-value">{len(pages_data)}</div>
                    <div class="metric-label">Pages Analyzed</div>
                </div>
                <div class="metric">
                    <div class="metric-value">{sum(k['frequency'] for k in keywords)}</div>
                    <div class="metric-label">Total Mentions</div>
                </div>
            </div>
            
            <h2>📊 Top 100 Keywords by Relevance</h2>
            <p>These keywords appear most frequently across competitor websites. A higher relevance score means the phrase is more prominent (and more specific) in competitor content.</p>
            
            <table>
                <thead>
                    <tr>
                        <th>#</th>
                        <th>Keyword</th>
                        <th>Frequency</th>
                        <th>Words</th>
                        <th>Relevance Score</th>
                    </tr>
                </thead>
                <tbody>
    """
    
    for i, kw in enumerate(keywords[:100], 1):
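        # The 20 / 10 cut-offs below are heuristic thresholds used only to color-code rows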
        score_class = 'high-score' if kw['relevance_score'] > 20 else ('med-score' if kw['relevance_score'] > 10 else 'low-score')
        
        html += f"""
                    <tr>
                        <td>{i}</td>
                        <td class="keyword">{kw['keyword']}</td>
                        <td>{kw['frequency']}</td>
                        <td>{kw['word_count']}</td>
                        <td class="{score_class}">{kw['relevance_score']}</td>
                    </tr>
        """
    
    html += """
                </tbody>
            </table>
            
            <h2>💡 How to Use These Keywords</h2>
            <ol>
                <li><strong>High Relevance (Score > 20)</strong>: These are your competitors' main topics. Create comprehensive content around these.</li>
                <li><strong>Medium Relevance (Score 10-20)</strong>: Supporting topics. Use in H2/H3 headings and supporting sections.</li>
                <li><strong>Frequency</strong>: How many times competitors mention this. Higher = more important to them.</li>
                <li><strong>Word Count</strong>: Longer phrases (4-5 words) are more specific and often easier to rank for.</li>
            </ol>
            
            <h2>🎯 Next Steps</h2>
            <ol>
                <li>Pick 5-10 high-relevance keywords</li>
                <li>Check their search volume in a keyword research tool (e.g. Google Keyword Planner)</li>
                <li>Create content targeting these keywords</li>
                <li>Use the exact phrases in your titles, H1s, and content</li>
                <li>Track rankings in Google Search Console</li>
            </ol>
        </div>
    </body>
    </html>
    """
    
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(html)


if __name__ == "__main__":
    keywords = extract_keywords_from_competitors()
    
    print(f"\n[REPORTS] Check the reports/ folder for detailed results!")
    print(f"\nYou now have a list of keywords your competitors are targeting.")
    print(f"Use these to create content and improve your SEO!")
