diff --git a/src/database.py b/src/database.py index da8d579..5af71dd 100644 --- a/src/database.py +++ b/src/database.py @@ -11,7 +11,7 @@ from datetime import datetime, timedelta from typing import Optional, List, Dict, Any from zoneinfo import ZoneInfo -from sqlalchemy import create_engine, func, distinct, case, event +from sqlalchemy import create_engine, func, distinct, case, event, or_ from sqlalchemy.orm import sessionmaker, scoped_session, Session from sqlalchemy.engine import Engine @@ -432,21 +432,22 @@ class DatabaseManager: def get_unenriched_ips(self, limit: int = 100) -> List[str]: """ - Get IPs that don't have reputation data yet. + Get IPs that don't have complete reputation data yet. + Returns IPs without country_code OR without city data. Excludes RFC1918 private addresses and other non-routable IPs. Args: limit: Maximum number of IPs to return Returns: - List of IP addresses without reputation data + List of IP addresses without complete reputation data """ session = self.session try: ips = ( session.query(IpStats.ip) .filter( - IpStats.country_code.is_(None), + or_(IpStats.country_code.is_(None), IpStats.city.is_(None)), ~IpStats.ip.like("10.%"), ~IpStats.ip.like("172.16.%"), ~IpStats.ip.like("172.17.%"), diff --git a/src/geo_utils.py b/src/geo_utils.py new file mode 100644 index 0000000..d11f01c --- /dev/null +++ b/src/geo_utils.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Geolocation utilities for reverse geocoding and city lookups. +""" + +import requests +from typing import Optional, Tuple +from logger import get_app_logger + +app_logger = get_app_logger() + +# Simple city name cache to avoid repeated API calls +_city_cache = {} + + +def reverse_geocode_city(latitude: float, longitude: float) -> Optional[str]: + """ + Reverse geocode coordinates to get city name using Nominatim (OpenStreetMap). + + Args: + latitude: Latitude coordinate + longitude: Longitude coordinate + + Returns: + City name or None if not found + """ + # Check cache first + cache_key = f"{latitude},{longitude}" + if cache_key in _city_cache: + return _city_cache[cache_key] + + try: + # Use Nominatim reverse geocoding API (free, no API key required) + url = "https://nominatim.openstreetmap.org/reverse" + params = { + "lat": latitude, + "lon": longitude, + "format": "json", + "zoom": 10, # City level + "addressdetails": 1, + } + headers = {"User-Agent": "Krawl-Honeypot/1.0"} # Required by Nominatim ToS + + response = requests.get(url, params=params, headers=headers, timeout=5) + response.raise_for_status() + + data = response.json() + address = data.get("address", {}) + + # Try to get city from various possible fields + city = ( + address.get("city") + or address.get("town") + or address.get("village") + or address.get("municipality") + or address.get("county") + ) + + # Cache the result + _city_cache[cache_key] = city + + if city: + app_logger.debug(f"Reverse geocoded {latitude},{longitude} to {city}") + + return city + + except requests.RequestException as e: + app_logger.warning(f"Reverse geocoding failed for {latitude},{longitude}: {e}") + return None + except Exception as e: + app_logger.error(f"Error in reverse geocoding: {e}") + return None + + +def get_most_recent_geoip_data(results: list) -> Optional[dict]: + """ + Extract the most recent geoip_data from API results. + Results are assumed to be sorted by record_added (most recent first). + + Args: + results: List of result dictionaries from IP reputation API + + Returns: + Most recent geoip_data dict or None + """ + if not results: + return None + + # The first result is the most recent (sorted by record_added) + most_recent = results[0] + return most_recent.get("geoip_data") + + +def extract_city_from_coordinates(geoip_data: dict) -> Optional[str]: + """ + Extract city name from geoip_data using reverse geocoding. + + Args: + geoip_data: Dictionary containing location_latitude and location_longitude + + Returns: + City name or None + """ + if not geoip_data: + return None + + latitude = geoip_data.get("location_latitude") + longitude = geoip_data.get("location_longitude") + + if latitude is None or longitude is None: + return None + + return reverse_geocode_city(latitude, longitude) diff --git a/src/tasks/fetch_ip_rep.py b/src/tasks/fetch_ip_rep.py index be74553..a005c62 100644 --- a/src/tasks/fetch_ip_rep.py +++ b/src/tasks/fetch_ip_rep.py @@ -2,6 +2,7 @@ from database import get_database from logger import get_app_logger import requests from sanitizer import sanitize_for_storage, sanitize_dict +from geo_utils import get_most_recent_geoip_data, extract_city_from_coordinates # ---------------------- # TASK CONFIG @@ -33,13 +34,20 @@ def main(): payload = response.json() if payload.get("results"): - data = payload["results"][0] - geoip_data = data["geoip_data"] + results = payload["results"] + + # Get the most recent result (first in list, sorted by record_added) + most_recent = results[0] + geoip_data = most_recent.get("geoip_data", {}) + list_on = most_recent.get("list_on", {}) + + # Extract standard fields country_iso_code = geoip_data.get("country_iso_code") asn = geoip_data.get("asn_autonomous_system_number") asn_org = geoip_data.get("asn_autonomous_system_organization") - city = geoip_data.get("city_name") # Extract city name from API - list_on = data["list_on"] + + # Extract city from coordinates using reverse geocoding + city = extract_city_from_coordinates(geoip_data) sanitized_country_iso_code = sanitize_for_storage(country_iso_code, 3) sanitized_asn = sanitize_for_storage(asn, 100) @@ -53,7 +61,7 @@ def main(): sanitized_asn, sanitized_asn_org, sanitized_list_on, - sanitized_city, # Pass city to database + sanitized_city, ) except requests.RequestException as e: app_logger.warning(f"Failed to fetch IP rep for {ip}: {e}") diff --git a/tests/test_insert_fake_ips.py b/tests/test_insert_fake_ips.py index bb4a777..bbce7e4 100644 --- a/tests/test_insert_fake_ips.py +++ b/tests/test_insert_fake_ips.py @@ -7,7 +7,8 @@ This generates realistic-looking test data including: - Credential attempts - Attack detections (SQL injection, XSS, etc.) - Category behavior changes for timeline demonstration -- Real good crawler IPs (Googlebot, Bingbot, etc.) with API-fetched geolocation +- Geolocation data fetched from API with reverse geocoded city names +- Real good crawler IPs (Googlebot, Bingbot, etc.) Usage: python test_insert_fake_ips.py [num_ips] [logs_per_ip] [credentials_per_ip] [--no-cleanup] @@ -17,6 +18,8 @@ Examples: python test_insert_fake_ips.py 30 # Generate 30 IPs with defaults python test_insert_fake_ips.py 30 20 5 # Generate 30 IPs, 20 logs each, 5 credentials each python test_insert_fake_ips.py --no-cleanup # Generate data without cleaning DB first + +Note: This script will make API calls to fetch geolocation data, so it may take a while. """ import random @@ -32,86 +35,72 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src")) from database import get_database from logger import get_app_logger +from geo_utils import extract_city_from_coordinates # ---------------------- # TEST DATA GENERATORS # ---------------------- -# Fake IPs with geolocation data (country_code, city, ASN org) -# These will appear on the map based on their country_code -FAKE_IPS_WITH_GEO = [ +# Fake IPs for testing - geolocation data will be fetched from API +# These are real public IPs from various locations around the world +FAKE_IPS = [ # United States - ("45.142.120.10", "US", "New York", "AS14061 DigitalOcean"), - ("107.189.10.143", "US", "Los Angeles", "AS20473 Vultr"), - ("162.243.175.23", "US", "San Francisco", "AS14061 DigitalOcean"), - ("198.51.100.89", "US", "Chicago", "AS16509 Amazon"), - + "45.142.120.10", + "107.189.10.143", + "162.243.175.23", + "198.51.100.89", # Europe - ("185.220.101.45", "DE", "Berlin", "AS24940 Hetzner"), - ("195.154.133.20", "FR", "Paris", "AS12876 Scaleway"), - ("178.128.83.165", "GB", "London", "AS14061 DigitalOcean"), - ("87.251.67.90", "NL", "Amsterdam", "AS49453 GlobalConnect"), - ("91.203.5.165", "RU", "Moscow", "AS51115 HLL LLC"), - ("46.105.57.169", "FR", "Roubaix", "AS16276 OVH"), - ("217.182.143.207", "RU", "Saint Petersburg", "AS51570 JSC ER-Telecom"), - ("188.166.123.45", "GB", "Manchester", "AS14061 DigitalOcean"), - + "185.220.101.45", + "195.154.133.20", + "178.128.83.165", + "87.251.67.90", + "91.203.5.165", + "46.105.57.169", + "217.182.143.207", + "188.166.123.45", # Asia - ("103.253.145.36", "CN", "Beijing", "AS4134 Chinanet"), - ("42.112.28.216", "CN", "Shanghai", "AS4134 Chinanet"), - ("118.163.74.160", "JP", "Tokyo", "AS2516 KDDI"), - ("43.229.53.35", "SG", "Singapore", "AS23969 TOT"), - ("115.78.208.140", "IN", "Mumbai", "AS9829 BSNL"), - ("14.139.56.18", "IN", "Bangalore", "AS4755 TATA"), - ("61.19.25.207", "TW", "Taipei", "AS3462 HiNet"), - ("121.126.219.198", "KR", "Seoul", "AS4766 Korea Telecom"), - ("202.134.4.212", "ID", "Jakarta", "AS7597 TELKOMNET"), - ("171.244.140.134", "VN", "Hanoi", "AS7552 Viettel"), - + "103.253.145.36", + "42.112.28.216", + "118.163.74.160", + "43.229.53.35", + "115.78.208.140", + "14.139.56.18", + "61.19.25.207", + "121.126.219.198", + "202.134.4.212", + "171.244.140.134", # South America - ("177.87.169.20", "BR", "São Paulo", "AS28573 Claro"), - ("200.21.19.58", "BR", "Rio de Janeiro", "AS7738 Telemar"), - ("181.13.140.98", "AR", "Buenos Aires", "AS7303 Telecom Argentina"), - ("190.150.24.34", "CO", "Bogotá", "AS3816 Colombia Telecomunicaciones"), - + "177.87.169.20", + "200.21.19.58", + "181.13.140.98", + "190.150.24.34", # Middle East & Africa - ("41.223.53.141", "EG", "Cairo", "AS8452 TE-Data"), - ("196.207.35.152", "ZA", "Johannesburg", "AS37271 Workonline"), - ("5.188.62.214", "TR", "Istanbul", "AS51115 HLL LLC"), - ("37.48.93.125", "AE", "Dubai", "AS5384 Emirates Telecom"), - ("102.66.137.29", "NG", "Lagos", "AS29465 MTN Nigeria"), - + "41.223.53.141", + "196.207.35.152", + "5.188.62.214", + "37.48.93.125", + "102.66.137.29", # Australia & Oceania - ("103.28.248.110", "AU", "Sydney", "AS4739 Internode"), - ("202.168.45.33", "AU", "Melbourne", "AS1221 Telstra"), - + "103.28.248.110", + "202.168.45.33", # Additional European IPs - ("94.102.49.190", "PL", "Warsaw", "AS12912 T-Mobile"), - ("213.32.93.140", "ES", "Madrid", "AS3352 Telefónica"), - ("79.137.79.167", "IT", "Rome", "AS3269 Telecom Italia"), - ("37.9.169.146", "SE", "Stockholm", "AS3301 Telia"), - ("188.92.80.123", "RO", "Bucharest", "AS8708 RCS & RDS"), - ("80.240.25.198", "CZ", "Prague", "AS6830 UPC"), + "94.102.49.190", + "213.32.93.140", + "79.137.79.167", + "37.9.169.146", + "188.92.80.123", + "80.240.25.198", ] -# Extract just IPs for backward compatibility -FAKE_IPS = [ip_data[0] for ip_data in FAKE_IPS_WITH_GEO] - -# Create geo data dictionary -FAKE_GEO_DATA = { - ip_data[0]: (ip_data[1], ip_data[2], ip_data[3]) - for ip_data in FAKE_IPS_WITH_GEO -} - # Real good crawler IPs (Googlebot, Bingbot, etc.) - geolocation will be fetched from API GOOD_CRAWLER_IPS = [ - "66.249.66.1", # Googlebot - "66.249.79.23", # Googlebot - "40.77.167.52", # Bingbot - "157.55.39.145", # Bingbot - "17.58.98.100", # Applebot - "199.59.150.39", # Twitterbot - "54.236.1.15", # Amazon Bot + "66.249.66.1", # Googlebot + "66.249.79.23", # Googlebot + "40.77.167.52", # Bingbot + "157.55.39.145", # Bingbot + "17.58.98.100", # Applebot + "199.59.150.39", # Twitterbot + "54.236.1.15", # Amazon Bot ] FAKE_PATHS = [ @@ -198,7 +187,13 @@ def cleanup_database(db_manager, app_logger): db_manager: Database manager instance app_logger: Logger instance """ - from models import AccessLog, CredentialAttempt, AttackDetection, IpStats, CategoryHistory + from models import ( + AccessLog, + CredentialAttempt, + AttackDetection, + IpStats, + CategoryHistory, + ) app_logger.info("=" * 60) app_logger.info("Cleaning up existing database data") @@ -232,6 +227,7 @@ def cleanup_database(db_manager, app_logger): def fetch_geolocation_from_api(ip: str, app_logger) -> tuple: """ Fetch geolocation data from the IP reputation API. + Uses the most recent result and extracts city from coordinates. Args: ip: IP address to lookup @@ -249,13 +245,18 @@ def fetch_geolocation_from_api(ip: str, app_logger) -> tuple: if response.status_code == 200: payload = response.json() if payload.get("results"): - data = payload["results"][0] - geoip_data = data.get("geoip_data", {}) + results = payload["results"] - country_code = geoip_data.get("country_iso_code", "Unknown") - city = geoip_data.get("city_name", "Unknown") + # Get the most recent result (first in list, sorted by record_added) + most_recent = results[0] + geoip_data = most_recent.get("geoip_data", {}) + + country_code = geoip_data.get("country_iso_code") asn = geoip_data.get("asn_autonomous_system_number") - asn_org = geoip_data.get("asn_autonomous_system_organization", "Unknown") + asn_org = geoip_data.get("asn_autonomous_system_organization") + + # Extract city from coordinates using reverse geocoding + city = extract_city_from_coordinates(geoip_data) return (country_code, city, asn, asn_org) except requests.RequestException as e: @@ -266,7 +267,13 @@ def fetch_geolocation_from_api(ip: str, app_logger) -> tuple: return None -def generate_fake_data(num_ips: int = 20, logs_per_ip: int = 15, credentials_per_ip: int = 3, include_good_crawlers: bool = True, cleanup: bool = True): +def generate_fake_data( + num_ips: int = 20, + logs_per_ip: int = 15, + credentials_per_ip: int = 3, + include_good_crawlers: bool = True, + cleanup: bool = True, +): """ Generate and insert fake test data into the database. @@ -308,8 +315,12 @@ def generate_fake_data(num_ips: int = 20, logs_per_ip: int = 15, credentials_per for _ in range(logs_per_ip): path = random.choice(FAKE_PATHS) user_agent = random.choice(FAKE_USER_AGENTS) - is_suspicious = random.choice([True, False, False]) # 33% chance of suspicious - is_honeypot = random.choice([True, False, False, False]) # 25% chance of honeypot trigger + is_suspicious = random.choice( + [True, False, False] + ) # 33% chance of suspicious + is_honeypot = random.choice( + [True, False, False, False] + ) # 25% chance of honeypot trigger # Randomly decide if this log has attack detections attack_types = None @@ -350,39 +361,45 @@ def generate_fake_data(num_ips: int = 20, logs_per_ip: int = 15, credentials_per app_logger.info(f" ✓ Generated {logs_per_ip} access logs") app_logger.info(f" ✓ Generated {credentials_per_ip} credential attempts") - # Add geolocation data if available for this IP - if ip in FAKE_GEO_DATA: - country_code, city, asn_org = FAKE_GEO_DATA[ip] - # Extract ASN number from ASN string (e.g., "AS12345 Name" -> 12345) - asn_number = None - if asn_org and asn_org.startswith("AS"): - try: - asn_number = int(asn_org.split()[0][2:]) # Remove "AS" prefix and get number - except (ValueError, IndexError): - asn_number = 12345 # Fallback + # Fetch geolocation data from API + app_logger.info(f" 🌍 Fetching geolocation from API...") + geo_data = fetch_geolocation_from_api(ip, app_logger) - # Update IP reputation info including geolocation and city + if geo_data: + country_code, city, asn, asn_org = geo_data db_manager.update_ip_rep_infos( ip=ip, country_code=country_code, - asn=asn_number or 12345, - asn_org=asn_org, + asn=asn if asn else 12345, + asn_org=asn_org or "Unknown", list_on={}, - city=city # Now passing city to the function + city=city, ) - app_logger.info(f" 📍 Added geolocation: {city}, {country_code} ({asn_org})") + location_display = ( + f"{city}, {country_code}" if city else country_code or "Unknown" + ) + app_logger.info( + f" 📍 API-fetched geolocation: {location_display} ({asn_org or 'Unknown'})" + ) + else: + app_logger.warning(f" ⚠ Could not fetch geolocation for {ip}") + + # Small delay to be nice to the API + time.sleep(0.5) # Trigger behavior/category changes to demonstrate timeline feature # First analysis initial_category = random.choice(CATEGORIES) - app_logger.info(f" ⟳ Analyzing behavior - Initial category: {initial_category}") - + app_logger.info( + f" ⟳ Analyzing behavior - Initial category: {initial_category}" + ) + db_manager.update_ip_stats_analysis( ip=ip, analyzed_metrics=generate_analyzed_metrics(), category=initial_category, category_scores=generate_category_scores(), - last_analysis=datetime.now(tz=ZoneInfo('UTC')) + last_analysis=datetime.now(tz=ZoneInfo("UTC")), ) total_category_changes += 1 @@ -391,30 +408,38 @@ def generate_fake_data(num_ips: int = 20, logs_per_ip: int = 15, credentials_per # Second analysis with potential category change (70% chance) if random.random() < 0.7: - new_category = random.choice([c for c in CATEGORIES if c != initial_category]) - app_logger.info(f" ⟳ Behavior change detected: {initial_category} → {new_category}") - + new_category = random.choice( + [c for c in CATEGORIES if c != initial_category] + ) + app_logger.info( + f" ⟳ Behavior change detected: {initial_category} → {new_category}" + ) + db_manager.update_ip_stats_analysis( ip=ip, analyzed_metrics=generate_analyzed_metrics(), category=new_category, category_scores=generate_category_scores(), - last_analysis=datetime.now(tz=ZoneInfo('UTC')) + last_analysis=datetime.now(tz=ZoneInfo("UTC")), ) total_category_changes += 1 # Optional third change (40% chance) if random.random() < 0.4: - final_category = random.choice([c for c in CATEGORIES if c != new_category]) - app_logger.info(f" ⟳ Another behavior change: {new_category} → {final_category}") - + final_category = random.choice( + [c for c in CATEGORIES if c != new_category] + ) + app_logger.info( + f" ⟳ Another behavior change: {new_category} → {final_category}" + ) + time.sleep(0.1) db_manager.update_ip_stats_analysis( ip=ip, analyzed_metrics=generate_analyzed_metrics(), category=final_category, category_scores=generate_category_scores(), - last_analysis=datetime.now(tz=ZoneInfo('UTC')) + last_analysis=datetime.now(tz=ZoneInfo("UTC")), ) total_category_changes += 1 @@ -433,7 +458,9 @@ def generate_fake_data(num_ips: int = 20, logs_per_ip: int = 15, credentials_per # Don't generate access logs for good crawlers to prevent re-categorization # We'll just create the IP stats entry with the category set - app_logger.info(f" ✓ Adding as good crawler (no logs to prevent re-categorization)") + app_logger.info( + f" ✓ Adding as good crawler (no logs to prevent re-categorization)" + ) # First, we need to create the IP in the database via persist_access # (but we'll only create one minimal log entry) @@ -456,9 +483,11 @@ def generate_fake_data(num_ips: int = 20, logs_per_ip: int = 15, credentials_per asn=asn if asn else 12345, asn_org=asn_org, list_on={}, - city=city + city=city, + ) + app_logger.info( + f" 📍 API-fetched geolocation: {city}, {country_code} ({asn_org})" ) - app_logger.info(f" 📍 API-fetched geolocation: {city}, {country_code} ({asn_org})") else: app_logger.warning(f" ⚠ Could not fetch geolocation for {crawler_ip}") @@ -479,7 +508,7 @@ def generate_fake_data(num_ips: int = 20, logs_per_ip: int = 15, credentials_per "regular_user": 0, "unknown": 0, }, - last_analysis=datetime.now(tz=ZoneInfo('UTC')) + last_analysis=datetime.now(tz=ZoneInfo("UTC")), ) total_good_crawlers += 1 time.sleep(0.5) # Small delay between API calls @@ -497,8 +526,12 @@ def generate_fake_data(num_ips: int = 20, logs_per_ip: int = 15, credentials_per app_logger.info(f"Total category changes: {total_category_changes}") app_logger.info("=" * 60) app_logger.info("\nYou can now view the dashboard with this test data.") - app_logger.info("The 'Behavior Timeline' will show category transitions for each IP.") - app_logger.info("The map will show good crawlers with real geolocation from API.") + app_logger.info( + "The 'Behavior Timeline' will show category transitions for each IP." + ) + app_logger.info( + "All IPs have API-fetched geolocation with reverse geocoded city names." + ) app_logger.info("Run: python server.py") app_logger.info("=" * 60) @@ -513,4 +546,10 @@ if __name__ == "__main__": # Add --no-cleanup flag to skip database cleanup cleanup = "--no-cleanup" not in sys.argv - generate_fake_data(num_ips, logs_per_ip, credentials_per_ip, include_good_crawlers=True, cleanup=cleanup) + generate_fake_data( + num_ips, + logs_per_ip, + credentials_per_ip, + include_good_crawlers=True, + cleanup=cleanup, + )