import requests
import json
import pandas as pd
import time
from datetime import datetime, timedelta
import logging
from collections import defaultdict
import os
import re
from calendar import monthrange
class DigitalAnalystScraper:
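"""Collect digital analyst / marketing analytics job ads from the JobTech historical ads API, month by month from 2016 onward, keeping only ads whose headline matches the include/exclude keyword lists."""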
def __init__(self):
self.historical_url = "https://historical.api.jobtechdev.se"
self.headers = {
"accept": "application/json"
}
self.all_jobs = []
self.stats = defaultdict(int)
self.failed_chunks = []
# Setup logging
log_filename = f'digital_analyst_scraper_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(),
logging.FileHandler(log_filename)
]
)
self.logger = logging.getLogger(__name__)
self.logger.info(f"๐ Log file: {log_filename}")
# Search terms - focused on digital analyst roles
self.search_terms = [
'web analyst',
'webbanalytiker',
'webb analytiker',
'webbstrateg',
'digital analyst',
'digitalanalytiker',
'digital analytiker',
'marketing analyst',
'digital strateg',
'digital strategy',
'marknadsanalytiker',
'konverteringsoptimerare',
'konverteringsanalytiker',
'conversion specialist',
'cro analyst',
'cro konsult',
'seo',
'sem',
'paid search',
'sökspecialist',
'gtm',
'e-commerce analyst',
'ecommerce analyst',
'google analytics',
'marknadsundersökare',
'growth analyst',
'performance marketing',
'marketing automation'
]
# Keywords that must be in title (case insensitive)
self.include_keywords = [
'web analyst',
'webbanalytiker',
'webb analytiker',
'webbstrateg',
'digital analyst',
'digitalanalytiker',
'digital analytiker',
'marketing analyst',
'digital strateg',
'digital strategy',
'marknadsanalytiker',
'konverteringsoptimerare',
'konverteringsanalytiker',
'conversion specialist',
'cro analyst',
'cro konsult',
'seo',
'sem',
'paid search',
'sökspecialist',
'gtm',
'e-commerce analyst',
'ecommerce analyst',
'google analytics',
'marknadsundersökare',
'growth analyst',
'performance marketing',
'marketing automation'
]
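# Note: include_keywords mirrors search_terms, so an ad is kept only if its headline itself
# contains one of the queried phrases (see is_valid_digital_analyst_job).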
# Keywords that exclude the job (case insensitive)
self.exclude_keywords = [
'utvecklare',
'developer',
'sem vik'
]
# Generate all months from 2016 to now
self.months = self.generate_months()
self.logger.info(f"๐
Months to process: {len(self.months)} ({self.months[0]} to {self.months[-1]})")
def generate_months(self):
"""Generate all months from 2016-01 to current month"""
months = []
current_date = datetime.now()
for year in range(2016, current_date.year + 1):
end_month = current_date.month if year == current_date.year else 12
for month in range(1, end_month + 1):
months.append(f"{year}-{month:02d}")
return months
def get_month_dates(self, year_month):
"""Get start and end dates for a month"""
year, month = map(int, year_month.split('-'))
last_day = monthrange(year, month)[1]
start_date = f"{year}-{month:02d}-01T00:00:00"
end_date = f"{year}-{month:02d}-{last_day:02d}T23:59:59"
return start_date, end_date
def search_month(self, query, year_month, limit=100, offset=0):
"""Search jobs for specific month"""
url = f"{self.historical_url}/search"
start_date, end_date = self.get_month_dates(year_month)
params = {
"q": query,
"limit": limit,
"offset": offset,
"published-after": start_date,
"published-before": end_date
}
try:
response = requests.get(url, headers=self.headers, params=params, timeout=30)
response.raise_for_status()
self.stats['api_calls'] += 1
return response.json()
except Exception as e:
self.logger.warning(f"API error for '{query}' {year_month}: {e}")
return None
def is_valid_digital_analyst_job(self, job):
"""Check if job title matches digital analyst criteria"""
title = job.get('headline', '').lower()
# Check if title contains any include keywords
has_include_keyword = any(keyword.lower() in title for keyword in self.include_keywords)
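# Note: this is plain substring matching, so short keywords such as 'sem', 'seo' or 'gtm'
# also hit inside longer words (e.g. 'sem' in 'semester').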
if not has_include_keyword:
return False
# Check if title contains any exclude keywords
has_exclude_keyword = any(keyword.lower() in title for keyword in self.exclude_keywords)
if has_exclude_keyword:
return False
return True
def get_month_jobs(self, search_term, year_month):
"""Get all valid digital analyst jobs for one search term in one month"""
jobs = []
offset = 0
limit = 100
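# Page through the month's results with limit/offset; the one-month window keeps each
# term's result set small enough to walk through in 100-hit pages.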
while True:
data = self.search_month(search_term, year_month, limit, offset)
if not data or 'hits' not in data or not data['hits']:
break
batch = data['hits']
# Filter for valid digital analyst jobs only
valid_jobs = []
for job in batch:
if self.is_valid_digital_analyst_job(job):
job['search_term'] = search_term
job['collection_month'] = year_month
job['scraped_at'] = datetime.now().isoformat()
valid_jobs.append(job)
else:
self.stats['filtered_out'] += 1
jobs.extend(valid_jobs)
self.stats['jobs_found'] += len(valid_jobs)
# Stop paging once this batch reached the end of the result set
total = data.get('total', {}).get('value', 0)
if offset + len(batch) >= total:
break
offset += limit
time.sleep(0.2) # Brief pause
# Safety check
if offset > 3000:
self.logger.warning(f"Large month chunk: {search_term} {year_month}")
break
return jobs
def save_checkpoint(self, completed_months):
"""Save progress checkpoint"""
if not self.all_jobs:
return
checkpoint_file = f"digital_analyst_checkpoint_{completed_months}of{len(self.months)}_{datetime.now().strftime('%H%M%S')}.json"
with open(checkpoint_file, 'w', encoding='utf-8') as f:
json.dump({
'jobs': self.all_jobs,
'completed_months': completed_months,
'total_months': len(self.months),
'stats': dict(self.stats),
'timestamp': datetime.now().isoformat()
}, f, ensure_ascii=False, indent=2, default=str)
self.logger.info(f"๐พ Checkpoint: {checkpoint_file} ({len(self.all_jobs)} jobs)")
def collect_all_jobs(self):
"""Main collection method - month by month"""
self.logger.info("๐ DIGITAL ANALYST FOCUSED COLLECTION STARTING")
self.logger.info("=" * 60)
self.logger.info(f"๐
{len(self.months)} months to process")
self.logger.info(f"๐ Search terms: {len(self.search_terms)} terms")
self.logger.info(f"โ
Include keywords: {len(self.include_keywords)} terms")
self.logger.info(f"โ Exclude: utvecklare, developer, sem vik")
total_chunks = len(self.months) * len(self.search_terms)
self.logger.info(f"๐ฆ {total_chunks:,} total month-term chunks")
self.logger.info("=" * 60)
chunk_count = 0
for month_idx, year_month in enumerate(self.months):
month_start = datetime.now()
month_jobs_before = len(self.all_jobs)
# Log progress every 6 months or early months
if month_idx % 6 == 0 or month_idx < 5:
progress = (month_idx / len(self.months)) * 100
self.logger.info(f"\\n๐
MONTH {month_idx+1}/{len(self.months)}: {year_month} ({progress:.1f}%)")
self.logger.info(f" ๐ Valid digital analyst jobs so far: {len(self.all_jobs):,}")
# Process all search terms for this month
for term in self.search_terms:
chunk_count += 1
try:
month_jobs = self.get_month_jobs(term, year_month)
self.all_jobs.extend(month_jobs)
# Log significant finds
if len(month_jobs) > 3:
chunk_progress = (chunk_count / total_chunks) * 100
self.logger.info(f" ๐ {year_month} '{term}': {len(month_jobs)} valid jobs ({chunk_progress:.2f}%)")
except KeyboardInterrupt:
self.logger.info("โน๏ธ Interrupted - saving progress...")
self.save_checkpoint(month_idx)
raise
except Exception as e:
self.failed_chunks.append(f"{term} - {year_month}: {e}")
self.logger.error(f"โ Failed: {term} {year_month}: {e}")
time.sleep(0.1)
# Month completed
month_jobs_added = len(self.all_jobs) - month_jobs_before
month_duration = datetime.now() - month_start
if month_jobs_added > 10:
self.logger.info(f" โ
{year_month}: +{month_jobs_added} valid jobs ({month_duration})")
# Checkpoint every year
if (month_idx + 1) % 12 == 0:
self.save_checkpoint(month_idx + 1)
# Remove duplicates
self.logger.info("\\n๐งน Removing duplicates...")
original_count = len(self.all_jobs)
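# The same ad is usually returned for several search terms, so keep only the first copy of each ad id.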
unique_jobs = {}
for job in self.all_jobs:
job_id = job.get('id')
if job_id and job_id not in unique_jobs:
unique_jobs[job_id] = job
self.all_jobs = list(unique_jobs.values())
removed = original_count - len(self.all_jobs)
self.logger.info(f"๐๏ธ Removed {removed} duplicates")
self.logger.info(f"โ
Final dataset: {len(self.all_jobs):,} unique digital analyst jobs")
return self.all_jobs
def save_results(self):
"""Save final results"""
if not self.all_jobs:
return None
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Clean text for Excel - separate functions for different fields
def clean_text_short(text):
if not text:
return ''
text = str(text)
text = re.sub(r'[^a-zA-Z0-9\s\.,;:\-\(\)%/åäöÅÄÖ]', ' ', text)
text = re.sub(r'\s+', ' ', text).strip()
return text[:500]
def clean_text_full(text):
if not text:
return ''
text = str(text)
# Keep more characters for full descriptions, just clean up problematic ones
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', ' ', text)  # Remove control characters
text = re.sub(r'\s+', ' ', text).strip()
return text
# Create clean dataset
clean_data = []
for job in self.all_jobs:
employer = job.get('employer', {}) or {}
workplace = job.get('workplace_address', {}) or {}
salary = job.get('salary', {}) or {}
employment = job.get('employment_type', {}) or {}
pub_date = job.get('publication_date', '')
year = pub_date[:4] if pub_date else ''
month = pub_date[:7] if len(pub_date) >= 7 else ''
# Extract complete job information
description = job.get('description', {}) or {}
application_details = job.get('application_details', {}) or {}
must_have = job.get('must_have', {}) or {}
nice_to_have = job.get('nice_to_have', {}) or {}
occupation = job.get('occupation', {}) or {}
clean_job = {
'job_id': str(job.get('id', '')),
'search_term': clean_text_short(job.get('search_term', '')),
'title': clean_text_short(job.get('headline', '')),
'company': clean_text_short(employer.get('name', '')),
'city': clean_text_short(workplace.get('municipality', '')),
'region': clean_text_short(workplace.get('region', '')),
'publication_date': str(job.get('publication_date', '')),
'year': year,
'month': month,
'employment_type': clean_text_short(employment.get('label', '')),
'salary_min': str(salary.get('min', '')),
'salary_max': str(salary.get('max', '')),
'url': str(job.get('webpage_url', '')),
# Complete job description and details
'description_full': clean_text_full(description.get('text', '')),
'requirements': clean_text_full(description.get('requirements', '')),
'conditions': clean_text_full(description.get('conditions', '')),
'company_information': clean_text_full(description.get('company_information', '')),
# Application details
'application_email': str(application_details.get('email', '')),
'application_url': str(application_details.get('url', '')),
'application_reference': clean_text_short(application_details.get('reference', '')),
# Skills and requirements
'must_have_skills': clean_text_full(str(must_have.get('skills') or '')),
'must_have_education': clean_text_full(str(must_have.get('education') or '')),
'must_have_experience': clean_text_full(str(must_have.get('work_experiences') or '')),
'nice_to_have_skills': clean_text_full(str(nice_to_have.get('skills') or '')),
'nice_to_have_education': clean_text_full(str(nice_to_have.get('education') or '')),
# Job classification
'occupation_label': clean_text_short(occupation.get('label', '')),
'number_of_vacancies': str(job.get('number_of_vacancies', '')),
'application_deadline': str(job.get('application_deadline', ''))
}
clean_data.append(clean_job)
# Save main CSV
df = pd.DataFrame(clean_data)
csv_file = f"digital_analyst_jobs_2016_2025_{timestamp}.csv"
df.to_csv(csv_file, index=False, encoding='utf-8')
self.logger.info(f"๐ Main CSV: {csv_file}")
# Try Excel
excel_file = None
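# Note: DataFrame.to_excel needs an Excel writer engine such as openpyxl installed;
# if it is missing, the except branch below keeps the run alive with CSV output only.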
try:
excel_file = f"digital_analyst_jobs_2016_2025_{timestamp}.xlsx"
df.to_excel(excel_file, index=False)
self.logger.info(f"๐ Excel: {excel_file}")
except Exception as e:
self.logger.warning(f"Excel failed: {e}")
# Create trend analysis
monthly_trends = defaultdict(int)
yearly_trends = defaultdict(int)
for job in clean_data:
year = job['year']
month = job['month']
if month:
monthly_trends[month] += 1
if year:
yearly_trends[year] += 1
# Monthly trends file
monthly_data = [{'month': month, 'digital_analyst_jobs': count}
for month, count in sorted(monthly_trends.items())]
monthly_df = pd.DataFrame(monthly_data)
monthly_file = f"digital_analyst_monthly_trends_{timestamp}.csv"
monthly_df.to_csv(monthly_file, index=False)
self.logger.info(f"๐
Monthly trends: {monthly_file}")
# Yearly trends file
yearly_data = [{'year': year, 'digital_analyst_jobs': count}
for year, count in sorted(yearly_trends.items())]
yearly_df = pd.DataFrame(yearly_data)
yearly_file = f"digital_analyst_yearly_trends_{timestamp}.csv"
yearly_df.to_csv(yearly_file, index=False)
self.logger.info(f"๐ Yearly trends: {yearly_file}")
# Search term breakdown
term_trends = defaultdict(int)
for job in clean_data:
term = job['search_term']
if term:
term_trends[term] += 1
term_data = [{'search_term': term, 'jobs_found': count}
for term, count in sorted(term_trends.items(), key=lambda x: x[1], reverse=True)]
term_df = pd.DataFrame(term_data)
term_file = f"digital_analyst_search_term_breakdown_{timestamp}.csv"
term_df.to_csv(term_file, index=False)
self.logger.info(f"๐ Search term breakdown: {term_file}")
# Raw JSON backup
json_file = f"raw_digital_analyst_data_{timestamp}.json"
with open(json_file, 'w', encoding='utf-8') as f:
json.dump(self.all_jobs, f, ensure_ascii=False, indent=2, default=str)
self.logger.info(f"๐พ Raw JSON: {json_file}")
return {
'main_csv': csv_file,
'excel': excel_file,
'monthly_trends': monthly_file,
'yearly_trends': yearly_file,
'search_term_breakdown': term_file,
'raw_json': json_file,
'total_jobs': len(self.all_jobs)
}
def print_summary(self):
"""Print final summary"""
if not self.all_jobs:
print("โ No data collected")
return
print(f"\\n๐ DIGITAL ANALYST COLLECTION FINISHED!")
print("=" * 55)
print(f"โ
Focused on Digital Analyst positions only")
print(f"๐ Total Jobs: {len(self.all_jobs):,}")
print(f"๐
Period: {self.months[0]} to {self.months[-1]}")
print(f"๐ API Calls: {self.stats['api_calls']:,}")
print(f"๐ Valid Jobs Found: {self.stats.get('jobs_found', 0):,}")
print(f"๐ฝ Jobs Filtered Out: {self.stats.get('filtered_out', 0):,}")
print(f"โ Failed Chunks: {len(self.failed_chunks)}")
# Year breakdown
year_counts = defaultdict(int)
for job in self.all_jobs:
pub_date = job.get('publication_date', '')
if pub_date:
year = pub_date[:4]
year_counts[year] += 1
print(f"\\n๐ YEARLY BREAKDOWN:")
for year in sorted(year_counts.keys()):
print(f" {year}: {year_counts[year]:,} digital analyst jobs")
# Search term breakdown
term_counts = defaultdict(int)
for job in self.all_jobs:
term_counts[job.get('search_term', 'Unknown')] += 1
print(f"\\n๐ TOP SEARCH TERMS:")
sorted_terms = sorted(term_counts.items(), key=lambda x: x[1], reverse=True)
for term, count in sorted_terms[:10]: # Show top 10
pct = (count / len(self.all_jobs)) * 100
print(f" '{term}': {count:,} jobs ({pct:.1f}%)")
# Include/exclude keyword analysis
print(f"\\n๐ฏ KEYWORD ANALYSIS:")
include_matches = defaultdict(int)
exclude_matches = defaultdict(int)
for job in self.all_jobs:
title = job.get('headline', '').lower()
# Count include keyword matches
for keyword in self.include_keywords:
if keyword.lower() in title:
include_matches[keyword] += 1
print(f" ๐ Top Include Keywords Found:")
sorted_includes = sorted(include_matches.items(), key=lambda x: x[1], reverse=True)
for keyword, count in sorted_includes[:10]:
pct = (count / len(self.all_jobs)) * 100
print(f" '{keyword}': {count:,} jobs ({pct:.1f}%)")
# Main execution
if __name__ == "__main__":
print("๐ DIGITAL ANALYST FOCUSED SCRAPER (2016-2025)")
print("=" * 60)
print("๐ฏ TARGET: Digital Analyst & Marketing Analytics positions")
print("โ
INCLUDE: Web analyst, Digital analyst, SEO, SEM, CRO, etc.")
print("โ EXCLUDE: utvecklare, developer, sem vik")
print("๐๏ธ PERIOD: 2016 to now")
print("๐พ Auto-saves checkpoints")
print("=" * 60)
months_count = len([f"{y}-{m:02d}" for y in range(2016, datetime.now().year + 1)
for m in range(1, 13 if y < datetime.now().year else datetime.now().month + 1)])
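# This count is recomputed here only for the pre-run summary; the scraper builds its own list in generate_months().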
print(f"\\n๐ Collection Details:")
print(f" ๐
Months: {months_count}")
print(f" ๐ Search terms: 28 digital marketing/analytics terms")
print(f" ๐ฆ Total chunks: ~{months_count * 28:,}")
print(f" ๐ฏ Focus: Digital Analyst & Marketing Analytics only")
print(f" ๐ซ Excludes: Developer/Utvecklare/sem vik roles")
print(f"\\n๐ฏ INCLUDE KEYWORDS:")
include_keywords = [
'Web analyst', 'Webbanalytiker', 'Webb analytiker', 'Webbstrateg',
'Digital analyst', 'Digitalanalytiker', 'Digital analytiker',
'Marketing analyst', 'Digital strateg', 'Digital strategy',
'Marknadsanalytiker', 'Konverteringsoptimerare', 'Konverteringsanalytiker',
'Conversion specialist', 'CRO analyst', 'CRO konsult', 'SEO', 'SEM',
'Paid search', 'Sökspecialist', 'GTM', 'E-commerce analyst',
'Ecommerce analyst', 'Google Analytics', 'Marknadsundersökare',
'Growth analyst', 'Performance marketing', 'Marketing automation'
]
for i, keyword in enumerate(include_keywords, 1):
print(f" {i:2d}. {keyword}")
print(f"\\n๐ซ EXCLUDE KEYWORDS:")
print(f" 1. Utvecklare")
print(f" 2. Developer")
print(f" 3. Sem vik")
response = input(f"\\n๐ Start focused digital analyst collection? (y/n): ").lower()
if response != 'y':
print("Cancelled.")
exit()
scraper = DigitalAnalystScraper()
try:
start_time = datetime.now()
print(f"\\nโฐ Started: {start_time}")
jobs = scraper.collect_all_jobs()
if jobs:
files = scraper.save_results()
scraper.print_summary()
end_time = datetime.now()
duration = end_time - start_time
print(f"\\nโฐ Runtime: {duration}")
print(f"โก Rate: {len(jobs)/duration.total_seconds()*60:.1f} jobs/min")
print(f"\\n๐ FILES CREATED:")
print(f" ๐ {files['main_csv']}")
if files['excel']:
print(f" ๐ {files['excel']}")
print(f" ๐ {files['yearly_trends']}")
print(f" ๐
{files['monthly_trends']}")
print(f" ๐ {files['search_term_breakdown']}")
print(f" ๐พ {files['raw_json']}")
print(f"\\n๐ SUCCESS!")
print(f"โ
Complete Digital Analyst dataset!")
print(f"๐ Perfect for digital marketing trend analysis!")
else:
print("โ No data collected")
except KeyboardInterrupt:
print("\\nโน๏ธ Stopped by user")
if scraper.all_jobs:
print("๐พ Saving partial data...")
scraper.save_results()
except Exception as e:
print(f"\\nโ Error: {e}")
if scraper.all_jobs:
print("๐พ Saving partial data...")
scraper.save_results()