"""Funda.nl scraper - request based using Nuxt data extraction"""
import json
import time
import random
import os
from bs4 import BeautifulSoup
from curl_cffi import requests
from datetime import datetime, date
from dotenv import load_dotenv

# Load environment variables from a local .env file (no-op if the file is absent)
load_dotenv()

# Proxy configuration from .env; any of these may be None when unset,
# in which case proxying is disabled by get_random_proxy()
PROXY_HOST = os.getenv('PROXY_HOST')
PROXY_PORT = os.getenv('PROXY_PORT')
PROXY_USER = os.getenv('PROXY_USER')
# Password prefix; a per-request session suffix from proxy-list.txt is appended
PROXY_PASS_PREFIX = os.getenv('PROXY_PASS_PREFIX')

# Proxy session list (from proxy-list.txt), resolved relative to this file
PROXY_SESSIONS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'proxy-list.txt')

def load_proxy_sessions(path=None):
    """Load proxy session suffixes from a file.

    Args:
        path: File to read; defaults to PROXY_SESSIONS_FILE when None.

    Returns:
        List of non-empty, stripped lines; [] when the file is missing.
    """
    if path is None:
        path = PROXY_SESSIONS_FILE
    try:
        # Explicit utf-8: the platform-default encoding is not portable.
        with open(path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        return []

# Session suffixes loaded once at import time; an empty list disables proxying
PROXY_SESSIONS = load_proxy_sessions()

def get_random_proxy():
    """Return a proxy URL built from a randomly chosen session suffix.

    Returns None when the proxy credentials are incomplete or no session
    suffixes are available.
    """
    credentials = [PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS_PREFIX]
    if not (PROXY_SESSIONS and all(credentials)):
        return None

    session_suffix = random.choice(PROXY_SESSIONS)
    password = f"{PROXY_PASS_PREFIX}{session_suffix}"
    return f"http://{PROXY_USER}:{password}@{PROXY_HOST}:{PROXY_PORT}"

class FundaScraper:
    """Scraper for Funda.nl search results using Nuxt data extraction.

    Funda pages embed their state in a ``__NUXT_DATA__`` script tag as a flat
    JSON array in which objects reference other array entries by integer
    index.  Listings are recovered by dereferencing those indices instead of
    parsing the rendered HTML, which is more stable against layout changes.
    """

    BASE_URL = 'https://www.funda.nl/zoeken/koop'
    HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Chrome/143.0.0.0'}

    def __init__(self, area='amsterdam', use_proxy=True):
        """
        Args:
            area: Funda area slug to search (default: 'amsterdam').
            use_proxy: Route requests through rotating proxies; silently
                disabled when no proxy sessions are configured.
        """
        self.area = area
        # Proxy use requires at least one session suffix from proxy-list.txt.
        self.use_proxy = use_proxy and bool(PROXY_SESSIONS)
        self.session = requests.Session()
        if self.use_proxy:
            print(f"Proxy enabled: {len(PROXY_SESSIONS)} sessions available")

    def _get_proxy_dict(self):
        """Return a requests-style proxies dict, or None when proxying is off."""
        if not self.use_proxy:
            return None
        proxy_url = get_random_proxy()
        if proxy_url:
            return {'http': proxy_url, 'https': proxy_url}
        return None

    def _fetch_page(self, page=1):
        """Fetch a single page of search results.

        Args:
            page: 1-based result page number.

        Returns:
            Raw HTML text, or None on a non-200 response.
        """
        # Funda expects JSON-style quoting inside the query string values,
        # e.g. selected_area=["amsterdam"] and sort="date_down".
        url = (
            f'{self.BASE_URL}?selected_area=["{self.area}"]'
            f'&sort="date_down"&search_result={page}'
        )

        proxies = self._get_proxy_dict()
        response = self.session.get(url, headers=self.HEADERS, impersonate='chrome', proxies=proxies)
        if response.status_code != 200:
            print(f"Error: HTTP {response.status_code}")
            return None
        return response.text

    def _parse_nuxt_data(self, html):
        """Extract listings from the __NUXT_DATA__ payload embedded in *html*.

        Returns:
            (listings, total_count): list of listing dicts (or None when the
            payload is missing) and the total result count reported by the
            page (0 when not found).
        """
        soup = BeautifulSoup(html, 'html.parser')
        script = soup.find('script', {'id': '__NUXT_DATA__'})
        if not script:
            return None, 0

        data = json.loads(script.string)

        def deref(idx):
            """Dereference a Nuxt index into the flat data array."""
            if isinstance(idx, int) and 0 <= idx < len(data):
                return data[idx]
            return idx

        def get_floor_area(floor_area_idx):
            """Extract floor area: floor_area -> [idx] -> value (m²)."""
            val = deref(floor_area_idx)
            if isinstance(val, list) and val:
                first = deref(val[0])
                # Sanity range guards against dereferencing an unrelated entry.
                if isinstance(first, (int, float)) and 5 <= first <= 5000:
                    return first
            return None

        # Total result count comes from the pinia store dumped into the payload.
        total_count = 0
        for entry in data:
            if isinstance(entry, dict) and 'totalListingsCount' in entry:
                # NOTE(review): value is used as-is; if it is a Nuxt index
                # rather than a literal int it would need deref() — confirm.
                total_count = entry.get('totalListingsCount', 0)
                break

        listings = []
        for item in data:
            # A listing object is recognized by having both a detail URL
            # and an address reference.
            if not (isinstance(item, dict)
                    and 'object_detail_page_relative_url' in item
                    and 'address' in item):
                continue

            # Address components
            addr = deref(item.get('address'))
            if isinstance(addr, dict):
                street = deref(addr.get('street_name', ''))
                num = deref(addr.get('house_number', ''))
                city = deref(addr.get('city', ''))
                pc = deref(addr.get('postal_code', ''))
            else:
                street = num = city = pc = ''

            # Price: price -> selling_price -> [idx] -> value
            price_val = None
            price = deref(item.get('price'))
            if isinstance(price, dict):
                sp = deref(price.get('selling_price'))
                if isinstance(sp, list) and sp:
                    price_val = deref(sp[0])

            # Floor area (nested)
            floor_val = get_floor_area(item.get('floor_area'))

            # Simple scalar fields; reject bools (bool is an int subclass)
            # and implausibly large counts that indicate a bad dereference.
            rooms = deref(item.get('number_of_rooms'))
            if not isinstance(rooms, int) or isinstance(rooms, bool) or rooms > 50:
                rooms = None

            bedrooms = deref(item.get('number_of_bedrooms'))
            if not isinstance(bedrooms, int) or isinstance(bedrooms, bool) or bedrooms > 50:
                bedrooms = None

            energy = deref(item.get('energy_label'))
            if not isinstance(energy, str):
                energy = None

            publish = deref(item.get('publish_date'))
            if not isinstance(publish, str):
                publish = None

            url_path = deref(item.get('object_detail_page_relative_url', ''))
            if not isinstance(url_path, str):
                continue

            # Listing ID is the last path segment of the detail URL.
            listing_id = url_path.rstrip('/').split('/')[-1] if url_path else None

            # First image URL from thumbnail_id: numeric IDs such as
            # 223087476 map to
            # https://cloud.funda.nl/valentina_media/223/087/476.jpg
            image_url = None
            thumb_ids = deref(item.get('thumbnail_id'))
            if isinstance(thumb_ids, list) and thumb_ids:
                first_thumb = deref(thumb_ids[0])
                if isinstance(first_thumb, (int, str)):
                    thumb_str = str(first_thumb)
                    if len(thumb_str) >= 9:
                        # Split into 3-digit chunks: 223087476 -> 223/087/476
                        path = '/'.join(thumb_str[i:i + 3] for i in range(0, len(thumb_str), 3))
                        image_url = f"https://cloud.funda.nl/valentina_media/{path}.jpg"

            listings.append({
                'id': listing_id,
                'address': f"{street} {num}".strip(),
                'postal_code': pc,
                'city': city,
                'price': price_val,
                'floor_area_m2': floor_val,
                'rooms': rooms,
                'bedrooms': bedrooms,
                'energy_label': energy,
                'url': f"https://www.funda.nl{url_path}",
                'publish_date': publish,
                'image_url': image_url,  # Extracted from thumbnail_id
                'description': None,  # Filled by _fetch_details when requested
            })

        return listings, total_count

    def _fetch_details(self, url):
        """Fetch description and image URL from a listing detail page.

        Best-effort by design: network or parse failures yield None values
        rather than raising, so a single bad page cannot abort a scrape.

        Returns:
            Dict with 'description' and 'image_url' keys (values may be None).
        """
        result = {'description': None, 'image_url': None}
        try:
            proxies = self._get_proxy_dict()
            response = self.session.get(url, headers=self.HEADERS, impersonate='chrome', proxies=proxies)
            if response.status_code != 200:
                return result

            soup = BeautifulSoup(response.text, 'html.parser')

            # First image: any <img> rendered via nuxt-img.
            img = soup.find('img', {'data-nuxt-img': True})
            if img and img.get('src'):
                result['image_url'] = img['src'].split('?')[0]  # Remove query params for cleaner URL

            # Prefer the description embedded in the Nuxt payload.
            script = soup.find('script', {'id': '__NUXT_DATA__'})
            if script:
                data = json.loads(script.string)

                # Heuristic: the first long free-text string (not a URL,
                # JSON fragment, or markup) is taken as the description.
                for item in data:
                    if isinstance(item, str) and len(item) > 100:
                        if not item.startswith(('http', '{', '[', '<')):
                            result['description'] = item.strip()
                            break

                # Alternative: an explicit 'description' key holding an index.
                if not result['description']:
                    for item in data:
                        if isinstance(item, dict) and 'description' in item:
                            desc_idx = item['description']
                            if isinstance(desc_idx, int) and 0 <= desc_idx < len(data):
                                desc = data[desc_idx]
                                if isinstance(desc, str):
                                    result['description'] = desc.strip()
                                    break

            # Fallback: the rendered description block in the HTML.
            if not result['description']:
                desc_section = soup.find('div', {'data-test-id': 'object-description-body'})
                if desc_section:
                    result['description'] = desc_section.get_text(separator=' ', strip=True)

            return result
        except Exception:
            # Deliberate best-effort swallow; result holds whatever was found.
            return result

    def _fetch_description(self, url):
        """Fetch the description (Omschrijving) from a listing detail page - legacy method"""
        return self._fetch_details(url)['description']

    def scrape(self, filter_today=True, max_pages=10, fetch_descriptions=False):
        """
        Scrape Funda listings.

        Args:
            filter_today: If True, only return listings published today
            max_pages: Maximum number of pages to scrape
            fetch_descriptions: If True, fetch description from each listing's
                detail page (one extra request per listing)

        Returns:
            List of listing dictionaries
        """
        all_listings = []
        seen_ids = set()
        today = date.today()

        for page in range(1, max_pages + 1):
            print(f"Fetching page {page}...", end=' ')
            html = self._fetch_page(page)
            if not html:
                break

            listings, total_count = self._parse_nuxt_data(html)
            if not listings:
                print("no listings found")
                break

            print(f"found {len(listings)} listings")

            new_count = 0
            stop_pagination = False

            for listing in listings:
                # Skip duplicates that reappear across pages
                if listing['id'] in seen_ids:
                    continue
                seen_ids.add(listing['id'])

                # Filter by today if requested
                if filter_today and listing['publish_date']:
                    try:
                        # Normalize e.g. '2024-01-02T10:30:00.123' for fromisoformat
                        pub_str = listing['publish_date'].split('.')[0].replace('T', ' ')
                        pub_date = datetime.fromisoformat(pub_str).date()
                        if pub_date < today:
                            # Results are sorted by date descending, so once a
                            # past date appears the rest is older too.
                            stop_pagination = True
                            continue
                    except ValueError:
                        # Unparseable date: keep the listing rather than drop it
                        pass

                all_listings.append(listing)
                new_count += 1

            # Stop if we're past today's listings or no new listings
            if stop_pagination or new_count == 0:
                break

            # Rate limiting with random delay
            time.sleep(random.uniform(1.0, 2.5))

        # Fetch descriptions and images if requested
        if fetch_descriptions and all_listings:
            print(f"\nFetching details for {len(all_listings)} listings...")
            for i, listing in enumerate(all_listings, 1):
                print(f"  [{i}/{len(all_listings)}] {listing['address']}...", end=' ')
                details = self._fetch_details(listing['url'])
                listing['description'] = details['description']
                # Keep the thumbnail-derived image when the detail page has none.
                listing['image_url'] = details['image_url'] or listing['image_url']
                if listing['description']:
                    print(f"OK ({len(listing['description'])} chars)")
                else:
                    print("not found")
                time.sleep(random.uniform(1.0, 3.0))  # Random delay between requests

        return all_listings

    def to_json(self, listings, filepath=None, merge=True):
        """Export listings to JSON, merging with existing file and overwriting duplicates by ID.

        Args:
            listings: Listing dicts as produced by scrape().
            filepath: Optional output path; when omitted nothing is written.
            merge: Merge into an existing file keyed by listing ID.

        Returns:
            The JSON string that was (or would have been) written.
        """
        if filepath and merge:
            existing = []
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    existing = json.load(f)
            except (FileNotFoundError, json.JSONDecodeError):
                pass

            # Build dict keyed by ID - new listings overwrite old ones
            listings_by_id = {entry['id']: entry for entry in existing}
            new_count = 0
            updated_count = 0
            for entry in listings:
                if entry['id'] in listings_by_id:
                    updated_count += 1
                else:
                    new_count += 1
                listings_by_id[entry['id']] = entry

            merged = list(listings_by_id.values())
            # Sort by publish_date descending; 'or' coalesces None values
            # (publish_date may be None), which would otherwise make the
            # str/None comparison raise TypeError.
            merged.sort(key=lambda x: x.get('publish_date') or '', reverse=True)

            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(merged, f, indent=2, ensure_ascii=False)
            print(f"Saved {len(merged)} listings to {filepath} ({new_count} new, {updated_count} updated)")
            return json.dumps(merged, indent=2, ensure_ascii=False)

        if filepath:
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(listings, f, indent=2, ensure_ascii=False)
            print(f"Saved {len(listings)} listings to {filepath}")
        return json.dumps(listings, indent=2, ensure_ascii=False)


def scrape_funda_amsterdam(filter_today=True):
    """Backward-compatible helper: scrape Amsterdam with default settings."""
    return FundaScraper(area='amsterdam').scrape(filter_today=filter_today)


if __name__ == '__main__':
    import argparse

    # CLI for ad-hoc scraping runs.
    parser = argparse.ArgumentParser(description='Scrape Funda.nl listings for Amsterdam')
    parser.add_argument('--today', action='store_true', help='Only show listings from today')
    parser.add_argument('--pages', type=int, default=5, help='Max pages to scrape (default: 5)')
    parser.add_argument('--output', '-o', type=str, help='Output JSON file path')
    parser.add_argument('--area', type=str, default='amsterdam', help='Area to search (default: amsterdam)')
    parser.add_argument('--descriptions', '-d', action='store_true', help='Fetch description (Omschrijving) for each listing')
    args = parser.parse_args()

    divider = "=" * 60
    print(divider)
    print(f"Funda.nl {args.area.title()} Listings Scraper")
    print(divider)

    scraper = FundaScraper(area=args.area)
    listings = scraper.scrape(filter_today=args.today, max_pages=args.pages, fetch_descriptions=args.descriptions)

    print(f"\n{divider}")
    print(f"Total listings found: {len(listings)}")
    print(divider)

    # Persist results when an output path was given.
    if args.output:
        scraper.to_json(listings, args.output)

    # Human-readable summary of each listing.
    for idx, entry in enumerate(listings, 1):
        if entry['price']:
            price_str = f"€ {entry['price']:,.0f}".replace(",", ".")
        else:
            price_str = "N/A"
        floor_str = "N/A" if not entry['floor_area_m2'] else f"{int(entry['floor_area_m2'])} m²"
        rooms_str = "N/A" if not entry['rooms'] else f"{entry['rooms']} rooms"
        beds_str = f"/ {entry['bedrooms']} beds " if entry['bedrooms'] else " "
        energy_str = entry['energy_label'] or 'N/A'

        print(f"\n{idx}. {entry['address']}")
        print(f"   {entry['postal_code']} {entry['city']}")
        print(f"   {price_str} | {floor_str} | {rooms_str} {beds_str}| Energy: {energy_str}")
        print(f"   Published: {entry['publish_date']}")
        print(f"   {entry['url']}")
        if entry.get('description'):
            desc = entry['description']
            desc_preview = desc[:200] + '...' if len(desc) > 200 else desc
            print(f"   Omschrijving: {desc_preview}")