import requests
import json
import pandas as pd
import time
from datetime import datetime, timedelta
import logging
from collections import defaultdict
import os
import re
from calendar import monthrange

class DigitalAnalystScraper:
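    """Collect Swedish digital analyst job ads from the historical job ads API
    at historical.api.jobtechdev.se.

    The collector searches month by month from 2016 onwards, keeps only ads whose
    title contains one of the include keywords and none of the exclude keywords,
    and saves the results as CSV/Excel plus monthly, yearly, and search-term
    trend breakdowns.
    """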
    def __init__(self):
        self.historical_url = "<https://historical.api.jobtechdev.se>"
        self.headers = {
            "accept": "application/json"
        }
        self.all_jobs = []
        self.stats = defaultdict(int)
        self.failed_chunks = []
        
        # Setup logging
        log_filename = f'digital_analyst_scraper_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
        logging.basicConfig(
            level=logging.INFO, 
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(),
                logging.FileHandler(log_filename)
            ]
        )
        self.logger = logging.getLogger(__name__)
        self.logger.info(f"๐Ÿ“‹ Log file: {log_filename}")
        
        # Search terms - focused on digital analyst roles
        self.search_terms = [
            'web analyst',
            'webbanalytiker',
            'webb analytiker', 
            'webbstrateg',
            'digital analyst',
            'digitalanalytiker',
            'digital analytiker',
            'marketing analyst',
            'digital strateg',
            'digital strategy',
            'marknadsanalytiker',
            'konverteringsoptimerare',
            'konverteringsanalytiker',
            'conversion specialist',
            'cro analyst',
            'cro konsult',
            'seo',
            'sem',
            'paid search',
            'sökspecialist',
            'gtm',
            'e-commerce analyst',
            'ecommerce analyst',
            'google analytics',
            'marknadsundersökare',
            'growth analyst',
            'performance marketing',
            'marketing automation'
        ]
        
        # Keywords that must be in title (case insensitive)
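        # (mirrors search_terms above: an ad is kept only if its headline
        #  actually contains one of these terms)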
        self.include_keywords = [
            'web analyst',
            'webbanalytiker',
            'webb analytiker', 
            'webbstrateg',
            'digital analyst',
            'digitalanalytiker',
            'digital analytiker',
            'marketing analyst',
            'digital strateg',
            'digital strategy',
            'marknadsanalytiker',
            'konverteringsoptimerare',
            'konverteringsanalytiker',
            'conversion specialist',
            'cro analyst',
            'cro konsult',
            'seo',
            'sem',
            'paid search',
            'sökspecialist',
            'gtm',
            'e-commerce analyst',
            'ecommerce analyst',
            'google analytics',
            'marknadsundersökare',
            'growth analyst',
            'performance marketing',
            'marketing automation'
        ]
        
        # Keywords that exclude the job (case insensitive)
        self.exclude_keywords = [
            'utvecklare',
            'developer',
            'sem vik'
        ]
        
        # Generate all months from 2016 to now
        self.months = self.generate_months()
        self.logger.info(f"๐Ÿ“… Months to process: {len(self.months)} ({self.months[0]} to {self.months[-1]})")

    def generate_months(self):
        """Generate all months from 2016-01 to current month"""
        months = []
        current_date = datetime.now()
        
        for year in range(2016, current_date.year + 1):
            end_month = current_date.month if year == current_date.year else 12
            
            for month in range(1, end_month + 1):
                months.append(f"{year}-{month:02d}")
        
        return months

    def get_month_dates(self, year_month):
        """Get start and end dates for a month"""
        year, month = map(int, year_month.split('-'))
        last_day = monthrange(year, month)[1]
        
        start_date = f"{year}-{month:02d}-01T00:00:00"
        end_date = f"{year}-{month:02d}-{last_day:02d}T23:59:59"
        
        return start_date, end_date

    def search_month(self, query, year_month, limit=100, offset=0):
        """Search jobs for specific month"""
        url = f"{self.historical_url}/search"
        start_date, end_date = self.get_month_dates(year_month)
        
        params = {
            "q": query,
            "limit": limit,
            "offset": offset,
            "published-after": start_date,
            "published-before": end_date
        }
        
        try:
            response = requests.get(url, headers=self.headers, params=params, timeout=30)
            response.raise_for_status()
            
            self.stats['api_calls'] += 1
            return response.json()
            
        except Exception as e:
            self.logger.warning(f"API error for '{query}' {year_month}: {e}")
            return None

    def is_valid_digital_analyst_job(self, job):
        """Check if job title matches digital analyst criteria"""
        title = (job.get('headline') or '').lower()
        
        # Check if title contains any include keywords
        has_include_keyword = any(keyword.lower() in title for keyword in self.include_keywords)
        
        if not has_include_keyword:
            return False
        
        # Check if title contains any exclude keywords
        has_exclude_keyword = any(keyword.lower() in title for keyword in self.exclude_keywords)
        
        if has_exclude_keyword:
            return False
        
        return True

    def get_month_jobs(self, search_term, year_month):
        """Get all valid digital analyst jobs for one search term in one month"""
        jobs = []
        offset = 0
        limit = 100
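        # Page through the results `limit` ads at a time until the API's
        # reported total has been retrieved (a safety cap below guards
        # against runaway months)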
        
        while True:
            data = self.search_month(search_term, year_month, limit, offset)
            
            if not data or 'hits' not in data or not data['hits']:
                break
            
            batch = data['hits']
            
            # Filter for valid digital analyst jobs only
            valid_jobs = []
            for job in batch:
                if self.is_valid_digital_analyst_job(job):
                    job['search_term'] = search_term
                    job['collection_month'] = year_month
                    job['scraped_at'] = datetime.now().isoformat()
                    valid_jobs.append(job)
                else:
                    self.stats['filtered_out'] += 1
            
            jobs.extend(valid_jobs)
            self.stats['jobs_found'] += len(valid_jobs)
            
            # Stop once every hit reported by the API has been paged through
            # (count raw hits, not just those that passed the title filter)
            total = data.get('total', {}).get('value', 0)
            if offset + len(batch) >= total:
                break
            
            offset += limit
            time.sleep(0.2)  # Brief pause
            
            # Safety check
            if offset > 3000:
                self.logger.warning(f"Large month chunk: {search_term} {year_month}")
                break
        
        return jobs

    def save_checkpoint(self, completed_months):
        """Save progress checkpoint"""
        if not self.all_jobs:
            return
        
        checkpoint_file = f"digital_analyst_checkpoint_{completed_months}of{len(self.months)}_{datetime.now().strftime('%H%M%S')}.json"
        
        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump({
                'jobs': self.all_jobs,
                'completed_months': completed_months,
                'total_months': len(self.months),
                'stats': dict(self.stats),
                'timestamp': datetime.now().isoformat()
            }, f, ensure_ascii=False, indent=2, default=str)
        
        self.logger.info(f"๐Ÿ’พ Checkpoint: {checkpoint_file} ({len(self.all_jobs)} jobs)")

    def collect_all_jobs(self):
        """Main collection method - month by month"""
        self.logger.info("๐Ÿš€ DIGITAL ANALYST FOCUSED COLLECTION STARTING")
        self.logger.info("=" * 60)
        self.logger.info(f"๐Ÿ“… {len(self.months)} months to process")
        self.logger.info(f"๐Ÿ” Search terms: {len(self.search_terms)} terms")
        self.logger.info(f"โœ… Include keywords: {len(self.include_keywords)} terms")
        self.logger.info(f"โŒ Exclude: utvecklare, developer, sem vik")
        
        total_chunks = len(self.months) * len(self.search_terms)
        self.logger.info(f"๐Ÿ“ฆ {total_chunks:,} total month-term chunks")
        self.logger.info("=" * 60)
        
        chunk_count = 0
        
        for month_idx, year_month in enumerate(self.months):
            month_start = datetime.now()
            month_jobs_before = len(self.all_jobs)
            
            # Log progress every 6 months or early months
            if month_idx % 6 == 0 or month_idx < 5:
                progress = (month_idx / len(self.months)) * 100
                self.logger.info(f"\\n๐Ÿ“… MONTH {month_idx+1}/{len(self.months)}: {year_month} ({progress:.1f}%)")
                self.logger.info(f"   ๐Ÿ“Š Valid digital analyst jobs so far: {len(self.all_jobs):,}")
            
            # Process all search terms for this month
            for term in self.search_terms:
                chunk_count += 1
                
                try:
                    month_jobs = self.get_month_jobs(term, year_month)
                    self.all_jobs.extend(month_jobs)
                    
                    # Log significant finds
                    if len(month_jobs) > 3:
                        chunk_progress = (chunk_count / total_chunks) * 100
                        self.logger.info(f"   ๐Ÿ“ˆ {year_month} '{term}': {len(month_jobs)} valid jobs ({chunk_progress:.2f}%)")
                    
                except KeyboardInterrupt:
                    self.logger.info("โ„น๏ธ Interrupted - saving progress...")
                    self.save_checkpoint(month_idx)
                    raise
                except Exception as e:
                    self.failed_chunks.append(f"{term} - {year_month}: {e}")
                    self.logger.error(f"โŒ Failed: {term} {year_month}: {e}")
                
                time.sleep(0.1)
            
            # Month completed
            month_jobs_added = len(self.all_jobs) - month_jobs_before
            month_duration = datetime.now() - month_start
            
            if month_jobs_added > 10:
                self.logger.info(f"   โœ… {year_month}: +{month_jobs_added} valid jobs ({month_duration})")
            
            # Checkpoint every year
            if (month_idx + 1) % 12 == 0:
                self.save_checkpoint(month_idx + 1)
        
        # Remove duplicates
        self.logger.info("\\n๐Ÿงน Removing duplicates...")
        original_count = len(self.all_jobs)
        
        unique_jobs = {}
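        # The same ad can be returned for several search terms (and months),
        # so deduplicate on the ad id and keep the first occurrence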
        for job in self.all_jobs:
            job_id = job.get('id')
            if job_id and job_id not in unique_jobs:
                unique_jobs[job_id] = job
        
        self.all_jobs = list(unique_jobs.values())
        removed = original_count - len(self.all_jobs)
        
        self.logger.info(f"๐Ÿ—‘๏ธ Removed {removed} duplicates")
        self.logger.info(f"โœ… Final dataset: {len(self.all_jobs):,} unique digital analyst jobs")
        
        return self.all_jobs

    def save_results(self):
        """Save final results"""
        if not self.all_jobs:
            return None
        
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Clean text for Excel - separate functions for different fields
        def clean_text_short(text):
            if not text:
                return ''
            text = str(text)
            text = re.sub(r'[^a-zA-Z0-9\s\.,;:\-\(\)%/åäöÅÄÖ]', ' ', text)
            text = re.sub(r'\s+', ' ', text).strip()
            return text[:500]
        
        def clean_text_full(text):
            if not text:
                return ''
            text = str(text)
            # Keep more characters for full descriptions, just clean up problematic ones
            text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', ' ', text)  # Remove control characters
            text = re.sub(r'\s+', ' ', text).strip()
            return text
        
        # Create clean dataset
        clean_data = []
        for job in self.all_jobs:
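            # Nested fields can be missing or explicitly null in the raw ad,
            # so fall back to an empty dict before reading their sub-fields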
            employer = job.get('employer', {}) or {}
            workplace = job.get('workplace_address', {}) or {}
            salary = job.get('salary', {}) or {}
            employment = job.get('employment_type', {}) or {}
            
            pub_date = job.get('publication_date', '')
            year = pub_date[:4] if pub_date else ''
            month = pub_date[:7] if len(pub_date) >= 7 else ''
            
            # Extract complete job information
            description = job.get('description', {}) or {}
            application_details = job.get('application_details', {}) or {}
            must_have = job.get('must_have', {}) or {}
            nice_to_have = job.get('nice_to_have', {}) or {}
            occupation = job.get('occupation', {}) or {}
            
            clean_job = {
                'job_id': str(job.get('id', '')),
                'search_term': clean_text_short(job.get('search_term', '')),
                'title': clean_text_short(job.get('headline', '')),
                'company': clean_text_short(employer.get('name', '')),
                'city': clean_text_short(workplace.get('municipality', '')),
                'region': clean_text_short(workplace.get('region', '')),
                'publication_date': str(job.get('publication_date', '')),
                'year': year,
                'month': month,
                'employment_type': clean_text_short(employment.get('label', '')),
                'salary_min': str(salary.get('min', '')),
                'salary_max': str(salary.get('max', '')),
                'url': str(job.get('webpage_url', '')),
                # Complete job description and details
                'description_full': clean_text_full(description.get('text', '')),
                'requirements': clean_text_full(description.get('requirements', '')),
                'conditions': clean_text_full(description.get('conditions', '')),
                'company_information': clean_text_full(description.get('company_information', '')),
                # Application details
                'application_email': str(application_details.get('email', '')),
                'application_url': str(application_details.get('url', '')),
                'application_reference': clean_text_short(application_details.get('reference', '')),
                # Skills and requirements
                'must_have_skills': clean_text_full(str(must_have.get('skills') or '')),
                'must_have_education': clean_text_full(str(must_have.get('education') or '')),
                'must_have_experience': clean_text_full(str(must_have.get('work_experiences') or '')),
                'nice_to_have_skills': clean_text_full(str(nice_to_have.get('skills') or '')),
                'nice_to_have_education': clean_text_full(str(nice_to_have.get('education') or '')),
                # Job classification
                'occupation_label': clean_text_short(occupation.get('label', '')),
                'number_of_vacancies': str(job.get('number_of_vacancies', '')),
                'application_deadline': str(job.get('application_deadline', ''))
            }
            clean_data.append(clean_job)
        
        # Save main CSV
        df = pd.DataFrame(clean_data)
        csv_file = f"digital_analyst_jobs_2016_2025_{timestamp}.csv"
        df.to_csv(csv_file, index=False, encoding='utf-8')
        self.logger.info(f"๐Ÿ“Š Main CSV: {csv_file}")
        
        # Try Excel
        excel_file = None
        try:
            excel_file = f"digital_analyst_jobs_2016_2025_{timestamp}.xlsx"
            df.to_excel(excel_file, index=False)
            self.logger.info(f"๐Ÿ“Š Excel: {excel_file}")
        except Exception as e:
            self.logger.warning(f"Excel failed: {e}")
        
        # Create trend analysis
        monthly_trends = defaultdict(int)
        yearly_trends = defaultdict(int)
        
        for job in clean_data:
            year = job['year']
            month = job['month']
            
            if month:
                monthly_trends[month] += 1
            
            if year:
                yearly_trends[year] += 1
        
        # Monthly trends file
        monthly_data = [{'month': month, 'digital_analyst_jobs': count} 
                       for month, count in sorted(monthly_trends.items())]
        monthly_df = pd.DataFrame(monthly_data)
        monthly_file = f"digital_analyst_monthly_trends_{timestamp}.csv"
        monthly_df.to_csv(monthly_file, index=False)
        self.logger.info(f"๐Ÿ“… Monthly trends: {monthly_file}")
        
        # Yearly trends file
        yearly_data = [{'year': year, 'digital_analyst_jobs': count} 
                      for year, count in sorted(yearly_trends.items())]
        yearly_df = pd.DataFrame(yearly_data)
        yearly_file = f"digital_analyst_yearly_trends_{timestamp}.csv"
        yearly_df.to_csv(yearly_file, index=False)
        self.logger.info(f"๐Ÿ“ˆ Yearly trends: {yearly_file}")
        
        # Search term breakdown
        term_trends = defaultdict(int)
        for job in clean_data:
            term = job['search_term']
            if term:
                term_trends[term] += 1
        
        term_data = [{'search_term': term, 'jobs_found': count} 
                    for term, count in sorted(term_trends.items(), key=lambda x: x[1], reverse=True)]
        term_df = pd.DataFrame(term_data)
        term_file = f"digital_analyst_search_term_breakdown_{timestamp}.csv"
        term_df.to_csv(term_file, index=False)
        self.logger.info(f"๐Ÿ” Search term breakdown: {term_file}")
        
        # Raw JSON backup
        json_file = f"raw_digital_analyst_data_{timestamp}.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(self.all_jobs, f, ensure_ascii=False, indent=2, default=str)
        self.logger.info(f"๐Ÿ’พ Raw JSON: {json_file}")
        
        return {
            'main_csv': csv_file,
            'excel': excel_file,
            'monthly_trends': monthly_file,
            'yearly_trends': yearly_file,
            'search_term_breakdown': term_file,
            'raw_json': json_file,
            'total_jobs': len(self.all_jobs)
        }

    def print_summary(self):
        """Print final summary"""
        if not self.all_jobs:
            print("โŒ No data collected")
            return
        
        print(f"\\n๐ŸŽ‰ DIGITAL ANALYST COLLECTION FINISHED!")
        print("=" * 55)
        print(f"โœ… Focused on Digital Analyst positions only")
        print(f"๐Ÿ“Š Total Jobs: {len(self.all_jobs):,}")
        print(f"๐Ÿ“… Period: {self.months[0]} to {self.months[-1]}")
        print(f"๐Ÿ” API Calls: {self.stats['api_calls']:,}")
        print(f"๐Ÿ“ˆ Valid Jobs Found: {self.stats.get('jobs_found', 0):,}")
        print(f"๐Ÿ”ฝ Jobs Filtered Out: {self.stats.get('filtered_out', 0):,}")
        print(f"โŒ Failed Chunks: {len(self.failed_chunks)}")
        
        # Year breakdown
        year_counts = defaultdict(int)
        for job in self.all_jobs:
            pub_date = job.get('publication_date', '')
            if pub_date:
                year = pub_date[:4]
                year_counts[year] += 1
        
        print(f"\\n๐Ÿ“ˆ YEARLY BREAKDOWN:")
        for year in sorted(year_counts.keys()):
            print(f"   {year}: {year_counts[year]:,} digital analyst jobs")
        
        # Search term breakdown
        term_counts = defaultdict(int)
        for job in self.all_jobs:
            term_counts[job.get('search_term', 'Unknown')] += 1
        
        print(f"\\n๐Ÿ” TOP SEARCH TERMS:")
        sorted_terms = sorted(term_counts.items(), key=lambda x: x[1], reverse=True)
        for term, count in sorted_terms[:10]:  # Show top 10
            pct = (count / len(self.all_jobs)) * 100
            print(f"   '{term}': {count:,} jobs ({pct:.1f}%)")
        
        # Include keyword analysis (ads hit by exclude keywords never reach all_jobs)
        print(f"\n🎯 KEYWORD ANALYSIS:")
        include_matches = defaultdict(int)
        
        for job in self.all_jobs:
            title = (job.get('headline') or '').lower()
            
            # Count include keyword matches
            for keyword in self.include_keywords:
                if keyword.lower() in title:
                    include_matches[keyword] += 1
        
        print(f"   ๐Ÿ“ˆ Top Include Keywords Found:")
        sorted_includes = sorted(include_matches.items(), key=lambda x: x[1], reverse=True)
        for keyword, count in sorted_includes[:10]:
            pct = (count / len(self.all_jobs)) * 100
            print(f"      '{keyword}': {count:,} jobs ({pct:.1f}%)")

# Main execution
if __name__ == "__main__":
    print("๐Ÿ” DIGITAL ANALYST FOCUSED SCRAPER (2016-2025)")
    print("=" * 60)
    print("๐ŸŽฏ TARGET: Digital Analyst & Marketing Analytics positions")
    print("โœ… INCLUDE: Web analyst, Digital analyst, SEO, SEM, CRO, etc.")
    print("โŒ EXCLUDE: utvecklare, developer, sem vik")
    print("๐Ÿ—“๏ธ PERIOD: 2016 to now")
    print("๐Ÿ’พ Auto-saves checkpoints")
    print("=" * 60)
    
    months_count = len([f"{y}-{m:02d}" for y in range(2016, datetime.now().year + 1) 
                       for m in range(1, 13 if y < datetime.now().year else datetime.now().month + 1)])
    
    print(f"\\n๐Ÿ“Š Collection Details:")
    print(f"   ๐Ÿ“… Months: {months_count}")
    print(f"   ๐Ÿ” Search terms: 28 digital marketing/analytics terms")
    print(f"   ๐Ÿ“ฆ Total chunks: ~{months_count * 28:,}")
    print(f"   ๐ŸŽฏ Focus: Digital Analyst & Marketing Analytics only")
    print(f"   ๐Ÿšซ Excludes: Developer/Utvecklare/sem vik roles")
    
    print(f"\\n๐ŸŽฏ INCLUDE KEYWORDS:")
    include_keywords = [
        'Web analyst', 'Webbanalytiker', 'Webb analytiker', 'Webbstrateg',
        'Digital analyst', 'Digitalanalytiker', 'Digital analytiker', 
        'Marketing analyst', 'Digital strateg', 'Digital strategy',
        'Marknadsanalytiker', 'Konverteringsoptimerare', 'Konverteringsanalytiker',
        'Conversion specialist', 'CRO analyst', 'CRO konsult', 'SEO', 'SEM',
        'Paid search', 'Sökspecialist', 'GTM', 'E-commerce analyst',
        'Ecommerce analyst', 'Google Analytics', 'Marknadsundersökare',
        'Growth analyst', 'Performance marketing', 'Marketing automation'
    ]
    for i, keyword in enumerate(include_keywords, 1):
        print(f"   {i:2d}. {keyword}")
    
    print(f"\\n๐Ÿšซ EXCLUDE KEYWORDS:")
    print(f"   1. Utvecklare")
    print(f"   2. Developer")
    print(f"   3. Sem vik")
    
    response = input(f"\\n๐Ÿš€ Start focused digital analyst collection? (y/n): ").lower()
    if response != 'y':
        print("Cancelled.")
        exit()
    
    scraper = DigitalAnalystScraper()
    
    try:
        start_time = datetime.now()
        print(f"\\nโฐ Started: {start_time}")
        
        jobs = scraper.collect_all_jobs()
        
        if jobs:
            files = scraper.save_results()
            scraper.print_summary()
            
            end_time = datetime.now()
            duration = end_time - start_time
            
            print(f"\\nโฐ Runtime: {duration}")
            print(f"โšก Rate: {len(jobs)/duration.total_seconds()*60:.1f} jobs/min")
            
            print(f"\\n๐Ÿ“ FILES CREATED:")
            print(f"   ๐Ÿ“Š {files['main_csv']}")
            if files['excel']:
                print(f"   ๐Ÿ“‹ {files['excel']}")
            print(f"   ๐Ÿ“ˆ {files['yearly_trends']}")
            print(f"   ๐Ÿ“… {files['monthly_trends']}")
            print(f"   ๐Ÿ” {files['search_term_breakdown']}")
            print(f"   ๐Ÿ’พ {files['raw_json']}")
            
            print(f"\\n๐Ÿ† SUCCESS!")
            print(f"โœ… Complete Digital Analyst dataset!")
            print(f"๐Ÿ“ˆ Perfect for digital marketing trend analysis!")
            
        else:
            print("โŒ No data collected")
            
    except KeyboardInterrupt:
        print("\\nโ„น๏ธ Stopped by user")
        if scraper.all_jobs:
            print("๐Ÿ’พ Saving partial data...")
            scraper.save_results()
            
    except Exception as e:
        print(f"\\nโŒ Error: {e}")
        if scraper.all_jobs:
            print("๐Ÿ’พ Saving partial data...")
            scraper.save_results()