import os
import sys

# Add the script's directory to the Python path to ensure modules can be found
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import requests
import polyline
import json
import psycopg2
import mercantile
import logging
import re
from datetime import datetime, timezone, timedelta
from urllib.parse import urlparse
from pyproj import Transformer
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Import the helper module for auto-repair
import get_rpc_config_auto

# Import provider classes
from base import BaseProvider
from kubra import KubraProvider
from simple import SimpleJsonProvider
from gwt_rpc import GwtRpcProvider
from nisc import NiscHostedProvider

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# --- LOGGING ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('power2_new.log'), logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

# --- CONFIG ---
DB_CONFIG = {'host': 'localhost', 'database': 'nws', 'user': 'nws', 'password': 'nws'}
CONFIG_FILE = 'providers.json'
AUTO_UPDATE_COOLDOWN_HOURS = 4  # Only try to repair once every 4 hours


# --- CONFIG MANAGEMENT ---
def load_providers():
    if not os.path.exists(CONFIG_FILE):
        logger.error(f"{CONFIG_FILE} not found!")
        return []
    with open(CONFIG_FILE, 'r') as f:
        all_providers = json.load(f)
    # Filter for providers that have a 'type' for point-based scraping
    return [p for p in all_providers if p.get('type')]


def save_providers(providers):
    with open(CONFIG_FILE, 'w') as f:
        json.dump(providers, f, indent=4)
    logger.info("Configuration saved to providers.json")


def update_provider_config(provider_name, new_settings):
    """Updates a specific provider in the JSON file safely."""
    # Read the raw file instead of load_providers(), which filters out
    # providers without a 'type'; saving that filtered list back would
    # silently drop every non-point provider from the config.
    if not os.path.exists(CONFIG_FILE):
        logger.error(f"{CONFIG_FILE} not found!")
        return False
    with open(CONFIG_FILE, 'r') as f:
        providers = json.load(f)
    updated = False
    for p in providers:
        if p.get('name') == provider_name:
            # Update all relevant fields
            for key in ['headers', 'body', 'url', 'cookies', 'user_agent']:
                if key in new_settings:
                    p[key] = new_settings[key]
            p['last_auto_update'] = datetime.now(timezone.utc).isoformat()
            updated = True
            break
    if updated:
        save_providers(providers)
        return True
    return False


# --- DATABASE ---
class PowerDB:
    def __init__(self, config):
        self.conn = psycopg2.connect(**config)
        self.conn.autocommit = True

    def close(self):
        self.conn.close()

    def upsert_outage(self, data):
        # Insert a new outage, or refresh an existing one keyed on pointgeom;
        # peakoutage only ever grows via GREATEST().
        sql = """
            INSERT INTO newpower (incidentid, utility, lat, lon, pointgeom, areageom,
                                  start_time, etr, outagen, peakoutage, cause, crew_status,
                                  active, last_change, fetch_time, geom)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                    ST_SetSRID(ST_MakePoint(%s, %s), 4326))
            ON CONFLICT (pointgeom) DO UPDATE SET
                outagen = EXCLUDED.outagen,
                peakoutage = GREATEST(newpower.peakoutage, EXCLUDED.outagen),
                cause = EXCLUDED.cause,
                etr = EXCLUDED.etr,
                crew_status = EXCLUDED.crew_status,
                last_change = EXCLUDED.last_change,
                fetch_time = EXCLUDED.fetch_time,
                active = TRUE
        """
        peak = data.get('outagen', 0)
        params = (
            data.get('incidentid'), data.get('utility'), data.get('lat'), data.get('lon'),
            data.get('pointgeom'), data.get('areageom'), data.get('start'), data.get('etr'),
            data.get('outagen'), peak, data.get('cause'), data.get('crew_status'),
            True, data.get('last_change', datetime.now(timezone.utc)),
            datetime.now(timezone.utc),
            data.get('lon'), data.get('lat')
        )
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)

    def run_post_processing(self):
        logger.info("Running post-processing...")
        with self.conn.cursor() as cursor:
            # Tag each outage point with its county, state, and NWS forecast zone (CWA)
            cursor.execute('UPDATE newpower SET county = c.countyname FROM public.county c WHERE ST_Contains(c.geom, newpower.geom) AND newpower.county IS NULL')
            cursor.execute('UPDATE newpower SET state = c.state FROM public.county c WHERE ST_Contains(c.geom, newpower.geom) AND newpower.state IS NULL')
            cursor.execute('UPDATE newpower SET cwa = f.cwa FROM public.fzone f WHERE ST_Contains(f.geom, newpower.geom) AND newpower.cwa IS NULL')
            # Decode encoded-polyline outage areas into real PostGIS geometries
            cursor.execute('UPDATE newpower SET realareageom = ST_LineFromEncodedPolyline(areageom) WHERE areageom IS NOT NULL AND realareageom IS NULL')
            # Mark outages active/inactive based on how recently they were fetched
            cursor.execute("UPDATE newpower SET active = TRUE WHERE fetch_time > NOW() - INTERVAL '30 minutes'")
            cursor.execute("UPDATE newpower SET active = FALSE WHERE fetch_time < NOW() - INTERVAL '30 minutes'")
            # Purge records older than a year
            cursor.execute("DELETE FROM newpower WHERE fetch_time < NOW() - INTERVAL '365 days'")
        logger.info("Post-processing complete.")
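
# Example payload for PowerDB.upsert_outage -- a minimal sketch of the dict
# shape implied by the data.get() calls above. The values (and the pointgeom
# format) are illustrative assumptions, not taken from a real provider:
#
#   db.upsert_outage({
#       'incidentid': 'INC-12345',
#       'utility': 'Example Electric Coop',
#       'lat': 35.1, 'lon': -97.4,
#       'pointgeom': '35.1,-97.4',
#       'start': datetime.now(timezone.utc),
#       'etr': None,
#       'outagen': 120,
#       'cause': 'Weather',
#       'crew_status': 'Assigned',
#   })
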
# --- REGISTRY ---
PROVIDER_REGISTRY = {
    'kubra': KubraProvider,
    'simple_json': SimpleJsonProvider,
    'gwt_rpc': GwtRpcProvider,
    'nisc_hosted': NiscHostedProvider,
}


# --- MAIN (Point Scraper) ---
def main():
    S = requests.Session()
    S.verify = False  # Some utility endpoints present broken TLS chains
    db = PowerDB(DB_CONFIG)
    logger.info("Starting Power Scraper...")
    providers = load_providers()
    for config in providers:
        p_type = config.get('type')
        p_name = config.get('name')
        ProviderClass = PROVIDER_REGISTRY.get(p_type)
        if ProviderClass:
            try:
                provider = ProviderClass(config, S)
                logger.info(f"Fetching {p_name}...")
                outages = provider.fetch()
                count = 0
                for outage in outages:
                    if outage:
                        db.upsert_outage(outage)
                        count += 1
                logger.info(f"Saved {count} records for {p_name}")
            except Exception as e:
                logger.error(f"Error processing {p_name}: {e}")
        else:
            logger.warning(f"Unknown provider type {p_type} for {p_name}")
    db.run_post_processing()
    db.close()
    logger.info("Scraping complete.")


if __name__ == "__main__":
    main()
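
# Deployment notes (sketches under assumptions, not taken from this repo):
#
# The ON CONFLICT (pointgeom) clause in PowerDB.upsert_outage requires a unique
# constraint on newpower.pointgeom, e.g.:
#
#   CREATE UNIQUE INDEX IF NOT EXISTS newpower_pointgeom_key
#       ON newpower (pointgeom);
#
# A minimal providers.json entry -- only 'name' and 'type' are consumed by this
# script itself; the remaining fields shown are assumptions based on what
# update_provider_config() and the provider classes accept:
#
#   [
#       {
#           "name": "Example Electric Coop",
#           "type": "simple_json",
#           "url": "https://outages.example.com/data/outages.json",
#           "headers": {"Accept": "application/json"},
#           "user_agent": "Mozilla/5.0"
#       }
#   ]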