"""
Keyword Research Module
Fetches and analyzes keyword data from Google Search Console
"""

import json
import os
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict
from html import escape
from gsc_authenticator import GSCAuthenticator

# Common English stop words stripped when deriving cluster topic keys
STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
              'to', 'for', 'of', 'with', 'by', 'from', 'up', 'about',
              'into', 'through', 'during', 'before', 'after', 'above',
              'below', 'between', 'under', 'again', 'further', 'then',
              'once', 'here', 'there', 'when', 'where', 'why', 'how',
              'all', 'both', 'each', 'few', 'more', 'most', 'other',
              'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
              'so', 'than', 'too', 'very', 'can', 'will', 'just', 'should'}


class KeywordResearcher:
    def __init__(self, config_file='config.json'):
        """
        Initialize keyword researcher
        
        Args:
            config_file: Path to configuration JSON file
        """
        with open(config_file, 'r') as f:
            self.config = json.load(f)
        
        self.auth = GSCAuthenticator()
        self.service = None
        self.data = []
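
        # self.config is kept for downstream consumers; a minimal
        # config.json might look like the following (hypothetical shape,
        # not read directly by this class):
        # {
        #     "property_url": "https://example.com/",
        #     "output_dir": "reports"
        # }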
    
    def connect(self):
        """Authenticate and connect to GSC API"""
        self.service = self.auth.authenticate()
        print("✓ Connected to Google Search Console")
    
    def fetch_gsc_data(self, property_url, days=90):
        """
        Fetch search performance data from Google Search Console
        
        Args:
            property_url: URL of the GSC property
            days: Number of days to fetch data for
            
        Returns:
            List of query data dictionaries
        """
        if not self.service:
            self.connect()
        
        # Note: GSC data typically lags a couple of days behind real time,
        # so the most recent dates in the range may return no rows
        end_date = datetime.now().date()
        start_date = end_date - timedelta(days=days)
        
        request = {
            'startDate': start_date.strftime('%Y-%m-%d'),
            'endDate': end_date.strftime('%Y-%m-%d'),
            'dimensions': ['query'],
            'rowLimit': 25000,  # Max allowed by API
            'startRow': 0
        }
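
        # The Search Analytics API also accepts dimension filters; a hedged
        # sketch of restricting results to queries containing a term
        # (uncomment and adapt the placeholder expression as needed):
        # request['dimensionFilterGroups'] = [{
        #     'filters': [{
        #         'dimension': 'query',
        #         'operator': 'contains',
        #         'expression': 'your term here'
        #     }]
        # }]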
        
        print(f"Fetching data from {start_date} to {end_date}...")
        
        try:
            response = self.service.searchanalytics().query(
                siteUrl=property_url, body=request).execute()
            
            rows = response.get('rows', [])
            print(f"✓ Fetched {len(rows)} queries")
            
            return rows
        
        except Exception as e:
            print(f"✗ Error fetching GSC data: {e}")
            return []
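
    def fetch_gsc_data_paginated(self, property_url, days=90, page_size=25000):
        """
        Illustrative pagination sketch (an added helper, not part of the
        original interface). The Search Analytics API caps each response
        at 25,000 rows, so larger properties need repeated requests that
        advance the startRow offset until no rows come back.
        """
        if not self.service:
            self.connect()

        end_date = datetime.now().date()
        start_date = end_date - timedelta(days=days)

        all_rows = []
        start_row = 0
        while True:
            request = {
                'startDate': start_date.strftime('%Y-%m-%d'),
                'endDate': end_date.strftime('%Y-%m-%d'),
                'dimensions': ['query'],
                'rowLimit': page_size,
                'startRow': start_row
            }
            response = self.service.searchanalytics().query(
                siteUrl=property_url, body=request).execute()
            rows = response.get('rows', [])
            if not rows:
                break
            all_rows.extend(rows)
            start_row += len(rows)

        return all_rows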
    
    def analyze_opportunities(self, gsc_data, min_impressions=10):
        """
        Analyze keyword opportunities from GSC data
        
        Args:
            gsc_data: Raw GSC data from fetch_gsc_data()
            min_impressions: Minimum impressions to consider
            
        Returns:
            DataFrame with opportunity analysis
        """
        opportunities = []
        
        for row in gsc_data:
            query = row['keys'][0]
            clicks = row.get('clicks', 0)
            impressions = row.get('impressions', 0)
            ctr = row.get('ctr', 0) * 100
            position = row.get('position', 0)
            
            if impressions < min_impressions:
                continue
            
            # Calculate opportunity score
            # Higher score = better opportunity
            opportunity_score = 0
            
            # High impressions but low clicks = opportunity
            if impressions > 100 and ctr < 5:
                opportunity_score += (impressions / 10) * (5 - ctr)
            
            # Position 4-20 = easy to improve
            if 4 <= position <= 20:
                opportunity_score += (21 - position) * 10
            
            # Position 11-20 (page 2) = high priority
            if 11 <= position <= 20:
                opportunity_score += 100
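
            # Worked example (hypothetical numbers): 500 impressions, 2% CTR,
            # position 12 scores (500 / 10) * (5 - 2) + (21 - 12) * 10 + 100 = 340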
            
            opportunities.append({
                'query': query,
                'clicks': clicks,
                'impressions': impressions,
                'ctr': round(ctr, 2),
                'position': round(position, 1),
                'opportunity_score': round(opportunity_score, 2),
                'word_count': len(query.split())
            })
        
        df = pd.DataFrame(opportunities)
        # Guard against an empty result: sort_values would raise a KeyError
        # because the column does not exist on an empty frame
        if df.empty:
            return df
        
        return df.sort_values('opportunity_score', ascending=False)
    
    def cluster_keywords(self, keywords_df):
        """
        Cluster related keywords by topic
        
        Queries are grouped by a key built from their first three
        significant (non-stop-word) terms, so keywords sharing the same
        core vocabulary land in the same cluster.
        
        Args:
            keywords_df: DataFrame with keyword data
            
        Returns:
            Dictionary of keyword clusters, sorted by total opportunity score
        """
        clusters = defaultdict(list)
        processed = set()
        
        for idx, row in keywords_df.iterrows():
            query = row['query']
            
            if query in processed:
                continue
            
            # Extract main topic words (drop common stop words)
            topic_words = set(query.lower().split()) - STOP_WORDS
            
            if not topic_words:
                continue
            
            # Cluster key: the first three topic words in alphabetical
            # order, so queries sharing core terms map to the same key
            cluster_key = ' '.join(sorted(topic_words)[:3])
            
            clusters[cluster_key].append({
                'query': query,
                'clicks': row['clicks'],
                'impressions': row['impressions'],
                'position': row['position'],
                'opportunity_score': row['opportunity_score']
            })
            
            processed.add(query)
        
        # Sort clusters by total opportunity score
        sorted_clusters = {}
        for key, queries in clusters.items():
            total_score = sum(q['opportunity_score'] for q in queries)
            sorted_clusters[key] = {
                'total_opportunity': total_score,
                'keyword_count': len(queries),
                'keywords': sorted(queries, key=lambda x: x['opportunity_score'], reverse=True)
            }
        
        return dict(sorted(sorted_clusters.items(), 
                          key=lambda x: x[1]['total_opportunity'], 
                          reverse=True))
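
    @staticmethod
    def _jaccard_similarity(query_a, query_b):
        """
        Hedged sketch of token-overlap (Jaccard) similarity between two
        queries. cluster_keywords() groups by shared topic keys rather
        than a pairwise threshold; this added helper only illustrates
        how a threshold-based comparison could look.
        """
        tokens_a = set(query_a.lower().split()) - STOP_WORDS
        tokens_b = set(query_b.lower().split()) - STOP_WORDS
        if not tokens_a or not tokens_b:
            return 0.0
        # Intersection over union of the significant tokens
        return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)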
    
    def export_keyword_report(self, opportunities_df, clusters, output_dir='reports'):
        """
        Export keyword research reports
        
        Args:
            opportunities_df: DataFrame with opportunity analysis
            clusters: Dictionary of keyword clusters
            output_dir: Directory to save reports
        """
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Export opportunities CSV (both versions)
        csv_path_ts = f"{output_dir}/keyword_opportunities_{timestamp}.csv"
        csv_path_static = f"{output_dir}/keyword_opportunities.csv"
        opportunities_df.to_csv(csv_path_ts, index=False)
        opportunities_df.to_csv(csv_path_static, index=False)
        print(f"✓ Saved opportunities: {csv_path_ts}")
        print(f"✓ Saved opportunities: {csv_path_static}")
        
        # Export clusters JSON (both versions)
        json_path_ts = f"{output_dir}/keyword_clusters_{timestamp}.json"
        json_path_static = f"{output_dir}/keyword_clusters.json"
        with open(json_path_ts, 'w') as f:
            json.dump(clusters, f, indent=2)
        with open(json_path_static, 'w') as f:
            json.dump(clusters, f, indent=2)
        print(f"✓ Saved clusters: {json_path_ts}")
        print(f"✓ Saved clusters: {json_path_static}")
        
        # Export summary HTML (both versions)
        html_path_ts = f"{output_dir}/keyword_summary_{timestamp}.html"
        html_path_static = f"{output_dir}/keyword_summary.html"
        self._generate_html_report(opportunities_df, clusters, html_path_ts)
        self._generate_html_report(opportunities_df, clusters, html_path_static)
        print(f"✓ Saved HTML report: {html_path_ts}")
        print(f"✓ Saved HTML report: {html_path_static}")
        
        return {
            'csv': csv_path_ts,
            'csv_static': csv_path_static,
            'json': json_path_ts,
            'json_static': json_path_static,
            'html': html_path_ts,
            'html_static': html_path_static
        }
    
    def _generate_html_report(self, df, clusters, output_path):
        """Generate HTML summary report"""
        html = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Keyword Research Report</title>
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                h1 {{ color: #2c3e50; }}
                h2 {{ color: #34495e; margin-top: 30px; }}
                table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
                th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
                th {{ background-color: #3498db; color: white; }}
                tr:nth-child(even) {{ background-color: #f2f2f2; }}
                .metric {{ display: inline-block; margin: 10px 20px; }}
                .metric-value {{ font-size: 24px; font-weight: bold; color: #3498db; }}
                .cluster {{ background: #ecf0f1; padding: 15px; margin: 10px 0; border-radius: 5px; }}
            </style>
        </head>
        <body>
            <h1>Keyword Research Report</h1>
            <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            
            <h2>Summary</h2>
            <div class="metric">
                <div>Total Keywords</div>
                <div class="metric-value">{len(df)}</div>
            </div>
            <div class="metric">
                <div>Total Clusters</div>
                <div class="metric-value">{len(clusters)}</div>
            </div>
            <div class="metric">
                <div>Total Clicks</div>
                <div class="metric-value">{df['clicks'].sum():,}</div>
            </div>
            <div class="metric">
                <div>Total Impressions</div>
                <div class="metric-value">{df['impressions'].sum():,}</div>
            </div>
            
            <h2>Top 20 Keyword Opportunities</h2>
            {df.head(20).to_html(index=False, classes='data-table')}
            
            <h2>Top 10 Keyword Clusters</h2>
        """
        
        for i, (cluster_name, data) in enumerate(list(clusters.items())[:10], 1):
            # Escape raw search terms so they cannot inject HTML markup
            html += f"""
            <div class="cluster">
                <h3>{i}. {escape(cluster_name)}</h3>
                <p><strong>Total Opportunity Score:</strong> {data['total_opportunity']:.2f}</p>
                <p><strong>Keywords in cluster:</strong> {data['keyword_count']}</p>
                <ul>
            """
            for kw in data['keywords'][:5]:
                html += f"<li>{escape(kw['query'])} (Pos: {kw['position']:.1f}, Clicks: {kw['clicks']}, Impr: {kw['impressions']})</li>"
            html += "</ul></div>"
        
        html += """
        </body>
        </html>
        """
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html)


if __name__ == "__main__":
    # Test keyword research
    print("Testing Keyword Research Module...")
    print("="*60)
    
    researcher = KeywordResearcher()
    
    # Get properties
    researcher.connect()
    properties = researcher.auth.list_properties()
    
    if properties:
        print(f"\nSelect a property:")
        for i, prop in enumerate(properties, 1):
            print(f"  {i}. {prop}")
        
        choice = int(input("\nEnter number: ")) - 1
        if not 0 <= choice < len(properties):
            raise SystemExit("Invalid selection")
        selected_property = properties[choice]
        
        print(f"\nFetching data for: {selected_property}")
        gsc_data = researcher.fetch_gsc_data(selected_property, days=90)
        
        if gsc_data:
            print("\nAnalyzing opportunities...")
            opportunities = researcher.analyze_opportunities(gsc_data)
            
            print("\nClustering keywords...")
            clusters = researcher.cluster_keywords(opportunities)
            
            print("\nExporting reports...")
            files = researcher.export_keyword_report(opportunities, clusters)
            
            print(f"\n✓ Complete! Check the reports directory.")
    else:
        print("No properties found!")
