
    $``i6                     B   S r SSKrSSKJr  SSKrSSKrSSKJrJr  SSK	r	SSK
Jr  SSKr " S S5      r\S:X  a  \" S	5        \" S
5        \" 5       r\R$                  S   S   r\" S\ 35        \R)                  \5      r\(       a  \" S5        \" S\S    35        \" S\S    35        \" S\" \S   5       35        \" S\" \S   5       35        Sr\R1                  \\5      r\" S\ S35        \" S\S    S35        \" S\S    35        \" S\S    35        ggg) zW
Competitor Content Analyzer
Scrapes and analyzes competitor websites for SEO insights
    N)BeautifulSoup)urljoinurlparse)Counterc                       \ rS rSrSS jrSS jrS rS rS rS r	S r
S	 rS
 rS rS rS rS rS rSS jrS rSrg)CompetitorAnalyzer   c                    [        US5       n[        R                  " U5      U l        SSS5        [        R
                  " 5       U l        U R                  R                  R                  SS05        g! , (       d  f       NP= f)zX
Initialize competitor analyzer

Args:
    config_file: Path to configuration JSON file
rNz
User-Agentz<Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36)	openjsonloadconfigrequestsSessionsessionheadersupdate)selfconfig_filefs      QC:\Users\lafte\Apps\Conservatory Transformations\seo_tools\competitor_analyzer.py__init__CompetitorAnalyzer.__init__   sb     +s#q))A,DK $  '')##X%
 		 $#s   A33
Bc                 ^    [        SU S35        U R                  R                  S/ 5      SU $ )z
Get top-ranking URLs from Google for a keyword

Args:
    keyword: Search query
    num_results: Number of results to return
    
Returns:
    List of URLs
z  Searching Google for: ''competitorsN)printr   get)r   keywordnum_resultss      r   search_google CompetitorAnalyzer.search_google    s4     	)'!45 {{}b1,;??    c                     [        SU 35        U R                  R                  USS9nUR                  5         [	        UR
                  S5      nU" / SQ5       H  nUR                  5         M     UU R                  U5      U R                  U5      U R                  U5      U R                  U5      SU R                  X15      U R                  X15      [        UR                  S5      5      U R                  U5      S.
n[        US	   R!                  5       5      US
'   U$ ! ["         a  n[        SU SU 35         SnAgSnAff = f)zy
Scrape content from a webpage

Args:
    url: URL to scrape
    
Returns:
    Dictionary with page content and metadata
z  Scraping: 
   )timeoutzhtml.parser)scriptstylenavfooterheaderr   img)
urltitlemeta_descriptionheadings	body_text
word_countinternal_linksexternal_linksimagesschema_markupr2   r3   z  [ERROR] Error scraping z: N)r   r   r   raise_for_statusr   content	decompose
_get_title_get_meta_description_extract_headings_get_body_text_get_internal_links_get_external_linkslenfind_all_has_schemasplit	Exception)r   r.   responsesoupr(   r9   es          r   scrape_page_content&CompetitorAnalyzer.scrape_page_content3   s9   	L&'||''R'8H%%' !1!1=AD MN  " O
 .$($>$>t$D 2248!006"&":":4"E"&":":4"EdmmE23!%!1!1$!7G %((<(B(B(D$EGL!N 	-cU"QC89	s   DD   
E*E  Ec                 r    UR                  S5      nU(       a  UR                  5       R                  5       $ S$ )zExtract page titler/    )findget_textstrip)r   rG   	title_tags      r   r;   CompetitorAnalyzer._get_title^   s/    IIg&	/8y!!#))+@b@r$   c                 x    UR                  SSS0S9nU(       a   UR                  SS5      R                  5       $ S$ )zExtract meta descriptionmetanamedescriptionattrsr9   rL   )rM   r   rO   )r   rG   rS   s      r   r<   (CompetitorAnalyzer._get_meta_descriptionc   s:    yy'>y?26txx	2&,,.>B>r$   c                     / n[        SS5       HN  nUR                  SU 35       H3  nUR                  UUR                  5       R	                  5       S.5        M5     MP     U$ )z#Extract all headings with hierarchy      h)leveltext)rangerB   appendrN   rO   )r   rG   r1   r]   headings        r   r=   $CompetitorAnalyzer._extract_headingsh   s`    1a[E==1UG5"#,,.446!  6 ! r$   c                     UR                  S5      =(       d)    UR                  S5      =(       d    UR                  S5      nU(       a*  UR                  SSS9n[        R                  " SSU5      nU$ g)	zExtract main body textmainarticlebody T)	separatorrO   z\s+rL   )rM   rN   resub)r   rG   main_contentr^   s       r   r>   !CompetitorAnalyzer._get_body_texts   s`     yy(UDIIi,@UDIIfDU((3d(CD66&#t,DKr$   c                     [        U5      R                  n/ nUR                  SSS9 H?  nUS   n[        X&5      n[        U5      R                  U:X  d  M.  UR	                  U5        MA     [        [        U5      5      $ )zExtract internal linksaThrefrp   )r   netlocrB   r   r`   listset)r   rG   base_urldomainr4   linkrp   full_urls           r   r?   &CompetitorAnalyzer._get_internal_links   st    (#**MM#DM1D<Dx.H!((F2%%h/ 2 C'((r$   c                 .   [        U5      R                  n/ nUR                  SSS9 HW  nUS   n[        X&5      n[        U5      R                  U:w  d  M.  UR	                  S5      (       d  MF  UR                  U5        MY     [        [        U5      5      $ )zExtract external linksrn   Tro   rp   http)r   rq   rB   r   
startswithr`   rr   rs   )r   rG   rt   ru   r5   rv   rp   rw   s           r   r@   &CompetitorAnalyzer._get_external_links   s    (#**MM#DM1D<Dx.H!((F2x7J7J67R7R%%h/ 2 C'((r$   c                 d    [        UR                  SS0S9=(       d    UR                  SSS95      $ )zCheck for schema.org markupitemtypeTrV   r(   zapplication/ld+json)type)boolrB   )r   rG   s     r   rC   CompetitorAnalyzer._has_schema   s:    DMMT(:M; G==0E=FH 	Hr$   c           	      P  ^ U(       d  gUS   R                  5       nUR                  5       mUR                  T5      nUS   S:  a
  XAS   -  S-  OSnTUS   R                  5       ;   nTUS   R                  5       ;   n[        U4S jUS	    5       5      nU R                  UT5      n	0 S
US
   _SUS   _SUS   _SU_S[	        US5      _SU_SU_SU_S[        US	    V
s/ s H  oS   S:X  d  M  U
PM     sn
5      _S[        US	    V
s/ s H  oS   S:X  d  M  U
PM     sn
5      _S[        US	    V
s/ s H  oS   S:X  d  M  U
PM     sn
5      _S[        US	   5      _S[        US   5      _S[        US   5      _SUS   _SUS   _SU	_S	US	   0EnU$ s  sn
f s  sn
f s  sn
f )z
Analyze content structure and keyword usage

Args:
    content: Content dictionary from scrape_page_content()
    keyword: Target keyword to analyze
    
Returns:
    Dictionary with analysis results
Nr2   r3   r   d   r/   r0   c              3   d   >#    U  H%  oS    S:X  d  M  TUS   R                  5       ;   v   M'     g7f)r]   rZ   r^   N)lower).0r\   keyword_lowers     r   	<genexpr>?CompetitorAnalyzer.analyze_content_structure.<locals>.<genexpr>   s4      M%8gJ!O @]aioo.??%8s   00r1   r.   keyword_countkeyword_density   title_has_keywordmeta_has_keywordh1_has_keywordh1_countr]   rZ   h2_counth3_count   total_headingsr4   r5   r6   
has_schemar7   keyword_variations)r   countany_find_keyword_variationsroundrA   )r   r9   r    
text_lowerr   r   r   r   r   r   r\   analysisr   s               @r   analyze_content_structure,CompetitorAnalyzer.analyze_content_structure   se    [)//1
 #((7KRS_K`cdKd=<+@@3Fjk *WW-=-C-C-EE(G4F,G,M,M,OO M%,Z%8M M ":::}U
75>
WW%
 ',/
 ]	

 u_a8
  !2
  0
 n
 
(;O(;1zQQ(;OP
 
(;O(;1zQQ(;OP
 
(;O(;1zQQ(;OP
 c'*"56
 c'*:";<
 c'*:";<
 gh'
  '/2!
" !"4#
$ 
+%
*  POOs$   "F2FFF6F#F#c           
        ^ UR                  5       n[        5       nUR                  5       n[        [        U5      5       Hr  n[        US-   [	        US-   [        U5      S-   5      5       HC  nSR                  XVU 5      m[        U4S jU 5       5      (       d  M2  UR                  T5        ME     Mt     [        U5      SS $ )z&Find variations of the keyword in textrZ      rg   c              3   ,   >#    U  H	  oT;   v   M     g 7f)N )r   kwphrases     r   r   >CompetitorAnalyzer._find_keyword_variations.<locals>.<genexpr>   s     <mV|ms   N   )	rD   rs   r_   rA   minjoinr   addrr   )	r   r^   r    keyword_words
variationswordsijr   s	           @r   r   +CompetitorAnalyzer._find_keyword_variations   s       U
 

s5z"A1q5#a!eSZ!^"<=%!*-<m<<<NN6*	 > # J$$r$   c                     UR                  5       nUR                  5       n[        UR                  5       5      nUR                  U5      nUS:X  a  gXe-  S-  $ )z$Calculate keyword density percentager   r   )r   rA   rD   r   )r   r^   r    r   r   r3   r   s          r   calculate_keyword_density,CompetitorAnalyzer.calculate_keyword_density   sP    ZZ\
&
"((7?*c11r$   c                 h   U(       d  0 $ U Vs/ s H	  o"c  M  UPM     nnU(       d  0 $ [        U5      [        S U 5       5      [        U5      -  [        S U 5       5      [        U5      -  [        S U 5       5      [        U5      -  [        S U 5       5      [        U5      -  [        S U 5       5      [        U5      -  S-  [        S U 5       5      [        U5      -  S-  [        S U 5       5      [        U5      -  S-  [        S	 U 5       5      [        S
 U 5       5      US.nU$ s  snf )z~
Compare multiple competitor analyses

Args:
    analyses: List of analysis dictionaries
    
Returns:
    Comparison summary
c              3   *   #    U  H	  oS    v   M     g7fr3   Nr   r   rn   s     r   r   9CompetitorAnalyzer.compare_competitors.<locals>.<genexpr>
       !J>aL/>   c              3   *   #    U  H	  oS    v   M     g7f)r   Nr   r   s     r   r   r     s     &T^):';^r   c              3   *   #    U  H	  oS    v   M     g7f)r   Nr   r   s     r   r   r     s     L^"2 3^r   c              3   *   #    U  H	  oS    v   M     g7f)r4   Nr   r   s     r   r   r     s     %R>a(8&9>r   c              3   <   #    U  H  oS    (       d  M  Sv   M     g7f)r   rZ   Nr   r   s     r   r   r     s     '\>aGZE[>   	r   c              3   <   #    U  H  oS    (       d  M  Sv   M     g7f)r   rZ   Nr   r   s     r   r   r     s     $V1DTBUQQr   c              3   <   #    U  H  oS    (       d  M  Sv   M     g7f)r   rZ   Nr   r   s     r   r   r     s     #Q~!<AA~r   c              3   *   #    U  H	  oS    v   M     g7fr   r   r   s     r   r   r     r   r   c              3   *   #    U  H	  oS    v   M     g7fr   r   r   s     r   r   r     r   r   )total_analyzedavg_word_countavg_keyword_densityavg_headingsavg_internal_linkskeyword_in_title_pctkeyword_in_h1_pctschema_usage_pctmin_word_countmax_word_countr   )rA   sumr   max)r   analysesrn   valid_analyses
comparisons        r   compare_competitors&CompetitorAnalyzer.compare_competitors   s,    I%-?X!X?I ".1!!J>!JJSQ_M``#&&T^&T#TWZ[iWj#jL^LLsSaObb"%%R>%R"RUXYgUh"h$''\>'\$\_bcq_r$rux$x!$$V$V!VY\]kYl!lor!r ##Q~#Q QTWXfTg gjm m!!J>!JJ!!J>!JJ)

 ' @s
   D/D/c                    [        SU 35        [        5       nU/n/ nU(       a  [        U5      U:  a  UR                  S5      nXc;   a  M.  UR	                  U5        U R                  U5      nU(       a?  UR                  U5        US   SS  H"  nX;  d  M
  X;  d  M  UR                  U5        M$     [        R                  " S5        U(       a  [        U5      U:  a  M  [        S[        U5       S35        U$ )	z
Scrape internal pages from a website

Args:
    base_url: Starting URL
    max_pages: Maximum pages to scrape
    
Returns:
    List of scraped page data
z
Scraping internal pages from: r   r4   N   rZ   u     ✓ Scraped z pages)	r   rs   rA   popr   rI   r`   timesleep)	r   rt   	max_pagesvisitedto_visitscraped_pagesr.   r9   rv   s	            r   scrape_internal_pages(CompetitorAnalyzer.scrape_internal_pages  s     	0
;<%:3}-	9,,q/C~KK..s3G$$W- $$45bq9D*t/C - : JJqM# 3}-	9& 	s=12&9:r$   c                    SR                  S U 5       5      n[        R                  " SUR                  5       5      n/ n[	        SS5       HJ  n[	        [        U5      U-
  S-   5       H)  nSR                  X6Xe-    5      nUR                  U5        M+     ML     [        U5      n1 Skn	UR                  S5       VV
s/ s H  u  pzXy;  d  M  U
S:  d  M  Xz4PM     nnn
U$ s  sn
nf )	z
Extract all potential keywords from scraped pages

Args:
    pages_data: List of page content dictionaries
    
Returns:
    List of keyword phrases with frequency
rg   c              3   <   #    U  H  o(       d  M  US    v   M     g7f)r2   Nr   )r   pages     r   r   :CompetitorAnalyzer.extract_all_keywords.<locals>.<genexpr>I  s     M*$-D-*s   
z\b\w+\br   r   rZ   >   at thein theof theon theto thefor ther   )	r   ri   findallr   r_   rA   r`   r   most_common)r   
pages_dataall_textr   phrasesnr   r   phrase_countsstop_phrasesr   filtereds               r   extract_all_keywords'CompetitorAnalyzer.extract_all_keywords?  s     88M*MM 

:x~~'78q!A3u:>A-.%!#,/v& /   ( U9F9R9RSV9W @9W0 $5:QY $VO9W @ @s   =C C C )r   r   N)zconfig.json)r&   )r   )__name__
__module____qualname____firstlineno__r   r"   rI   r;   r<   r=   r>   r?   r@   rC   r   r   r   r   r   r   __static_attributes__r   r$   r   r   r      sY    @&)VA
?
	
))H
3j%(2 D%Nr$   r   __main__zTesting Competitor Analyzer...z<============================================================r   z
Scraping: u   
✓ Successfully scraped!z	  Title: r/   z  Word Count: r3   z  Headings: r1   z  Internal Links: r4   zconservatory roofz
Keyword Analysis for 'z':z  Keyword Density: r   %z  In Title: r   z	  In H1: r   )__doc__r   bs4r   r   ri   urllib.parser   r   r   collectionsr   pandaspdr   r   r   analyzerr   test_urlrI   r9   rA   test_keywordr   r   r   r$   r   <module>r     sm  
    	 *   M M`
 z	
*+	&M!#H }-a0H	L

#$**84G+-	''*+,-w|4567S!45678"3w/?'@#A"BCD +55g|L(b9:#H->$?#@BCX&9:;<=	(#34567  r$   