"""
Competitor Content Analyzer
Scrapes and analyzes competitor websites for SEO insights
"""

import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urljoin, urlparse
import time
from collections import Counter


class CompetitorAnalyzer:
    def __init__(self, config_file='config.json'):
        """
        Initialize competitor analyzer
        
        Args:
            config_file: Path to configuration JSON file
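        
        The only config key this module reads is "competitors"; an illustrative
        config.json might look like this (URLs are placeholders):
        
            {
                "competitors": [
                    "https://example.com/page-a",
                    "https://example.com/page-b"
                ]
            }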
        """
        with open(config_file, 'r') as f:
            self.config = json.load(f)
        
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
    
    def search_google(self, keyword, num_results=10):
        """
        Get top-ranking URLs from Google for a keyword
        
        Args:
            keyword: Search query
            num_results: Number of results to return
            
        Returns:
            List of URLs (currently the competitor URLs from the config file)
        """
        # Simplified placeholder: this returns the competitor URLs listed in
        # config rather than querying Google directly.
        print(f"  Searching Google for: '{keyword}'")
        
        # In production, query the Google Custom Search API instead.
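        # A rough sketch of that production path (assumes hypothetical
        # "google_api_key" and "google_cx" entries in config.json; the endpoint
        # and parameter names are those of the Custom Search JSON API):
        #
        #     resp = self.session.get(
        #         'https://www.googleapis.com/customsearch/v1',
        #         params={'key': self.config['google_api_key'],
        #                 'cx': self.config['google_cx'],
        #                 'q': keyword,
        #                 'num': min(num_results, 10)},
        #         timeout=10,
        #     )
        #     return [item['link'] for item in resp.json().get('items', [])]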
        return self.config.get('competitors', [])[:num_results]
    
    def scrape_page_content(self, url):
        """
        Scrape content from a webpage
        
        Args:
            url: URL to scrape
            
        Returns:
            Dictionary with page content and metadata
        """
        try:
            print(f"  Scraping: {url}")
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Strip non-content elements (scripts, styles, nav, header, footer)
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()
            
            # Extract content
            content = {
                'url': url,
                'title': self._get_title(soup),
                'meta_description': self._get_meta_description(soup),
                'headings': self._extract_headings(soup),
                'body_text': self._get_body_text(soup),
                'word_count': 0,
                'internal_links': self._get_internal_links(soup, url),
                'external_links': self._get_external_links(soup, url),
                'images': len(soup.find_all('img')),
                'schema_markup': self._has_schema(soup)
            }
            
            content['word_count'] = len(content['body_text'].split())
            
            return content
        
        except Exception as e:
            print(f"  [ERROR] Error scraping {url}: {e}")
            return None
    
    def _get_title(self, soup):
        """Extract page title"""
        title_tag = soup.find('title')
        return title_tag.get_text().strip() if title_tag else ''
    
    def _get_meta_description(self, soup):
        """Extract meta description"""
        meta = soup.find('meta', attrs={'name': 'description'})
        return meta.get('content', '').strip() if meta else ''
    
    def _extract_headings(self, soup):
        """Extract all headings with hierarchy"""
        headings = []
        for level in range(1, 7):
            for heading in soup.find_all(f'h{level}'):
                headings.append({
                    'level': level,
                    'text': heading.get_text().strip()
                })
        return headings
    
    def _get_body_text(self, soup):
        """Extract main body text"""
        # Try to find main content area
        main_content = soup.find('main') or soup.find('article') or soup.find('body')
        
        if main_content:
            text = main_content.get_text(separator=' ', strip=True)
            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text)
            return text
        return ''
    
    def _get_internal_links(self, soup, base_url):
        """Extract internal links"""
        domain = urlparse(base_url).netloc
        internal_links = []
        
        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = urljoin(base_url, href)
            
            if urlparse(full_url).netloc == domain:
                internal_links.append(full_url)
        
        return list(set(internal_links))
    
    def _get_external_links(self, soup, base_url):
        """Extract external links"""
        domain = urlparse(base_url).netloc
        external_links = []
        
        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = urljoin(base_url, href)
            
            if urlparse(full_url).netloc != domain and full_url.startswith('http'):
                external_links.append(full_url)
        
        return list(set(external_links))
    
    def _has_schema(self, soup):
        """Check for schema.org markup"""
        return bool(soup.find_all(attrs={'itemtype': True}) or 
                   soup.find_all('script', type='application/ld+json'))
    
    def analyze_content_structure(self, content, keyword):
        """
        Analyze content structure and keyword usage
        
        Args:
            content: Content dictionary from scrape_page_content()
            keyword: Target keyword to analyze
            
        Returns:
            Dictionary with analysis results
        """
        if not content:
            return None
        
        text_lower = content['body_text'].lower()
        keyword_lower = keyword.lower()
        
        # Calculate keyword metrics
        keyword_count = text_lower.count(keyword_lower)
        keyword_density = (keyword_count / content['word_count'] * 100) if content['word_count'] > 0 else 0
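        # Note: str.count() matches substrings, so extended forms such as
        # "conservatory roofs" also count toward "conservatory roof".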
        
        # Check keyword placement
        title_has_keyword = keyword_lower in content['title'].lower()
        meta_has_keyword = keyword_lower in content['meta_description'].lower()
        h1_has_keyword = any(keyword_lower in h['text'].lower() 
                            for h in content['headings'] if h['level'] == 1)
        
        # Extract keyword variations
        keyword_variations = self._find_keyword_variations(text_lower, keyword_lower)
        
        analysis = {
            'url': content['url'],
            'title': content['title'],
            'word_count': content['word_count'],
            'keyword_count': keyword_count,
            'keyword_density': round(keyword_density, 2),
            'title_has_keyword': title_has_keyword,
            'meta_has_keyword': meta_has_keyword,
            'h1_has_keyword': h1_has_keyword,
            'h1_count': len([h for h in content['headings'] if h['level'] == 1]),
            'h2_count': len([h for h in content['headings'] if h['level'] == 2]),
            'h3_count': len([h for h in content['headings'] if h['level'] == 3]),
            'total_headings': len(content['headings']),
            'internal_links': len(content['internal_links']),
            'external_links': len(content['external_links']),
            'images': content['images'],
            'has_schema': content['schema_markup'],
            'keyword_variations': keyword_variations,
            'headings': content['headings']
        }
        
        return analysis
    
    def _find_keyword_variations(self, text, keyword):
        """Find variations of the keyword in text"""
        # Split keyword into words
        keyword_words = keyword.split()
        
        # Count phrases (up to 5 words long) that contain any keyword word
        variation_counts = Counter()
        
        words = text.split()
        for i in range(len(words)):
            for j in range(i + 1, min(i + 6, len(words) + 1)):
                phrase = ' '.join(words[i:j])
                # Keep the phrase if it contains any keyword word
                if any(kw in phrase for kw in keyword_words):
                    variation_counts[phrase] += 1
        
        # Return the most common variations
        return [phrase for phrase, _ in variation_counts.most_common(20)]
    
    def calculate_keyword_density(self, text, keyword):
        """Calculate keyword density percentage"""
        text_lower = text.lower()
        keyword_lower = keyword.lower()
        
        word_count = len(text.split())
        keyword_count = text_lower.count(keyword_lower)
        
        if word_count == 0:
            return 0
        
        return (keyword_count / word_count) * 100
    
    def compare_competitors(self, analyses):
        """
        Compare multiple competitor analyses
        
        Args:
            analyses: List of analysis dictionaries
            
        Returns:
            Comparison summary
        """
        if not analyses:
            return {}
        
        valid_analyses = [a for a in analyses if a is not None]
        
        if not valid_analyses:
            return {}
        
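        # Percentages below are shares of the valid analyses, e.g. 2 of 3
        # competitors with the keyword in the title gives 2 / 3 * 100 ≈ 66.7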
        comparison = {
            'total_analyzed': len(valid_analyses),
            'avg_word_count': sum(a['word_count'] for a in valid_analyses) / len(valid_analyses),
            'avg_keyword_density': sum(a['keyword_density'] for a in valid_analyses) / len(valid_analyses),
            'avg_headings': sum(a['total_headings'] for a in valid_analyses) / len(valid_analyses),
            'avg_internal_links': sum(a['internal_links'] for a in valid_analyses) / len(valid_analyses),
            'keyword_in_title_pct': sum(1 for a in valid_analyses if a['title_has_keyword']) / len(valid_analyses) * 100,
            'keyword_in_h1_pct': sum(1 for a in valid_analyses if a['h1_has_keyword']) / len(valid_analyses) * 100,
            'schema_usage_pct': sum(1 for a in valid_analyses if a['has_schema']) / len(valid_analyses) * 100,
            'min_word_count': min(a['word_count'] for a in valid_analyses),
            'max_word_count': max(a['word_count'] for a in valid_analyses),
            'competitors': valid_analyses
        }
        
        return comparison
    
    def scrape_internal_pages(self, base_url, max_pages=20):
        """
        Scrape internal pages from a website
        
        Args:
            base_url: Starting URL
            max_pages: Maximum pages to scrape
            
        Returns:
            List of scraped page data
        """
        print(f"\nScraping internal pages from: {base_url}")
        
        visited = set()
        to_visit = [base_url]
        scraped_pages = []
        
        while to_visit and len(scraped_pages) < max_pages:
            url = to_visit.pop(0)
            
            if url in visited:
                continue
            
            visited.add(url)
            content = self.scrape_page_content(url)
            
            if content:
                scraped_pages.append(content)
                
                # Add internal links to queue
                for link in content['internal_links'][:5]:  # Limit to avoid too many pages
                    if link not in visited and link not in to_visit:
                        to_visit.append(link)
            
            time.sleep(1)  # Be polite
        
        print(f"  ✓ Scraped {len(scraped_pages)} pages")
        return scraped_pages
    
    def extract_all_keywords(self, pages_data):
        """
        Extract all potential keywords from scraped pages
        
        Args:
            pages_data: List of page content dictionaries
            
        Returns:
            List of keyword phrases with frequency
        """
        all_text = ' '.join(page['body_text'] for page in pages_data if page)
        
        # Extract 2-4 word phrases
        words = re.findall(r'\b\w+\b', all_text.lower())
        
        phrases = []
        for n in range(2, 5):  # 2, 3, 4 word phrases
            for i in range(len(words) - n + 1):
                phrase = ' '.join(words[i:i+n])
                phrases.append(phrase)
        
        # Count frequency
        phrase_counts = Counter(phrases)
        
        # Filter out very common/stop phrases
        stop_phrases = {'in the', 'of the', 'to the', 'for the', 'on the', 'at the'}
        
        filtered = [(phrase, count) for phrase, count in phrase_counts.most_common(100)
                   if phrase not in stop_phrases and count > 2]
        
        return filtered


if __name__ == "__main__":
    # Test competitor analyzer
    print("Testing Competitor Analyzer...")
    print("="*60)
    
    analyzer = CompetitorAnalyzer()
    
    # Test with the first configured competitor
    competitors = analyzer.config.get('competitors', [])
    if not competitors:
        raise SystemExit("No 'competitors' found in config.json")
    test_url = competitors[0]
    
    print(f"\nScraping: {test_url}")
    content = analyzer.scrape_page_content(test_url)
    
    if content:
        print(f"\n✓ Successfully scraped!")
        print(f"  Title: {content['title']}")
        print(f"  Word Count: {content['word_count']}")
        print(f"  Headings: {len(content['headings'])}")
        print(f"  Internal Links: {len(content['internal_links'])}")
        
        # Analyze for a test keyword
        test_keyword = "conservatory roof"
        analysis = analyzer.analyze_content_structure(content, test_keyword)
        
        print(f"\nKeyword Analysis for '{test_keyword}':")
        print(f"  Keyword Density: {analysis['keyword_density']}%")
        print(f"  In Title: {analysis['title_has_keyword']}")
        print(f"  In H1: {analysis['h1_has_keyword']}")
