import os
import sys

# Add the script's directory to the Python path to ensure modules can be found
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)

import json
import logging
import re
from datetime import datetime, timezone, timedelta
from urllib.parse import urlparse

import psycopg2
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Import the helper module for auto-repair
import get_rpc_config_auto

# Import provider classes
from providers.base import BaseCountyProvider
from providers.kubra import KubraCountyProvider
from providers.simple import SimpleCountyJsonProvider
from providers.nisc import NiscCountyProvider
from providers.gwt_rpc import GwtRpcCountyProvider

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# --- LOGGING ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(os.path.join(SCRIPT_DIR, 'newpower_county.log')),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# --- CONFIG ---
DB_CONFIG = {'host': 'localhost', 'database': 'nws', 'user': 'nws', 'password': 'nws'}
CONFIG_FILE = os.path.join(SCRIPT_DIR, 'providers.json')
AUTO_UPDATE_COOLDOWN_HOURS = 4  # Only try to repair once every 4 hours


# --- CONFIG MANAGEMENT ---
def load_providers():
    """Load providers.json and return only entries configured for county scraping.

    Returns:
        list[dict]: providers whose config has a truthy 'county_type' key,
        or an empty list (with an error logged) if the config file is missing.
    """
    if not os.path.exists(CONFIG_FILE):
        logger.error(f"{CONFIG_FILE} not found!")
        return []
    with open(CONFIG_FILE, 'r') as f:
        all_providers = json.load(f)
    # Keep only providers that declare a county scraper type
    return [p for p in all_providers if p.get('county_type')]


def save_providers(providers):
    """Write the full (unfiltered) provider list back to providers.json."""
    with open(CONFIG_FILE, 'w') as f:
        # default=str so datetimes (e.g. last_auto_update) serialize cleanly
        json.dump(providers, f, indent=4, default=str)
    logger.info("Configuration saved to providers.json")


def update_provider_config(provider_name, new_settings):
    """Updates a specific provider in the JSON file safely.

    Only the whitelisted keys are copied from new_settings so a broken
    auto-repair cannot clobber unrelated config; a UTC timestamp is stamped
    so callers can rate-limit repair attempts (see AUTO_UPDATE_COOLDOWN_HOURS).

    Args:
        provider_name: value of the provider's 'name' key in providers.json.
        new_settings: dict possibly containing headers/body/url/cookies/user_agent.

    Returns:
        bool: True if the provider was found and rewritten, False otherwise.
    """
    # This needs to read the raw file, not the filtered one
    with open(CONFIG_FILE, 'r') as f:
        providers = json.load(f)
    for p in providers:
        if p.get('name') == provider_name:
            for key in ('headers', 'body', 'url', 'cookies', 'user_agent'):
                if key in new_settings:
                    p[key] = new_settings[key]
            p['last_auto_update'] = datetime.now(timezone.utc).isoformat()
            save_providers(providers)
            return True
    return False


# --- DATABASE ---
class CountyPowerDB:
    """Thin psycopg2 wrapper for the newcountyoutages table."""

    def __init__(self, config):
        self.conn = psycopg2.connect(**config)
        self.conn.autocommit = True

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()

    def upsert_and_zero_outages(self, company_name, outage_data, fetch_time):
        """Update outage information for a given company.

        1. UPSERTS counties with active outages, updating their counts.
        2. SETS outage count to 0 for any other county from that company
           that was not in the active list.

        NOTE(review): the connection runs with autocommit=True, so the two
        steps commit independently rather than as one atomic transaction —
        confirm whether true atomicity is required here.

        Args:
            company_name: company whose stale rows should be zeroed.
            outage_data: iterable of dicts; rows missing any of
                'county'/'state'/'company' are silently skipped.
            fetch_time: timestamp stamped on every touched row.
        """
        # Prepare data for counties with active outages
        active_outage_values = []
        reported_counties = []
        for item in outage_data:
            if all(k in item for k in ('county', 'state', 'company')):
                active_outage_values.append((
                    item['county'],
                    item['state'],
                    item['company'],
                    item.get('outages'),
                    item.get('served'),
                    fetch_time,
                ))
                reported_counties.append(item['county'])

        with self.conn.cursor() as cursor:
            # Step 1: UPSERT active outages
            if active_outage_values:
                upsert_sql = """
                    INSERT INTO newcountyoutages (county, state, company, outages, served, fetch_time)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON CONFLICT (county, state, company) DO UPDATE SET
                        outages = EXCLUDED.outages,
                        served = COALESCE(EXCLUDED.served, newcountyoutages.served),
                        fetch_time = EXCLUDED.fetch_time;
                """
                cursor.executemany(upsert_sql, active_outage_values)
                logger.info(f"Upserted {len(active_outage_values)} active outage records for {company_name}.")

            # Step 2: Set outages to 0 for any other county from this company.
            # This correctly creates point-in-time zero records by updating the fetch_time.
            # NOTE(review): the NOT IN match is on county name only, not (county, state) —
            # confirm no company spans same-named counties in different states.
            zero_out_sql = """
                UPDATE newcountyoutages
                SET outages = 0, fetch_time = %s
                WHERE company = %s AND county NOT IN %s;
            """
            # Ensure reported_counties is not empty to avoid "IN (NULL)"
            if not reported_counties:
                reported_counties.append("NO_COUNTIES_REPORTED_DUMMY_VALUE")
            cursor.execute(zero_out_sql, (fetch_time, company_name, tuple(reported_counties)))
            logger.info(f"Zeroed out {cursor.rowcount} resolved outage records for {company_name}.")

    def run_post_processing(self):
        """Backfill the cwa column from public.county and prune rows older than a year."""
        logger.info("Running post-processing for county data...")
        with self.conn.cursor() as cursor:
            # Fill in the CWA (forecast office) only where it is still missing
            cursor.execute("""
                UPDATE newcountyoutages SET cwa = c.cwa
                FROM public.county c
                WHERE c.countyname = newcountyoutages.county
                  AND c.state = newcountyoutages.state
                  AND newcountyoutages.cwa IS NULL
            """)
            cursor.execute("DELETE FROM newcountyoutages WHERE fetch_time < NOW() - INTERVAL '365 days'")
        logger.info("County post-processing complete.")


# --- PROVIDERS ---
# --- REGISTRY ---
# Maps a provider config's 'county_type' string to its implementation class.
PROVIDER_REGISTRY = {
    'kubra_county': KubraCountyProvider,
    'simple_county_json': SimpleCountyJsonProvider,
    'nisc_hosted_county': NiscCountyProvider,
    'gwt_rpc_county': GwtRpcCountyProvider,
}


# --- MAIN ---
def main():
    """Fetch county outage data from every configured provider and store it."""
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'})
    db = CountyPowerDB(DB_CONFIG)
    run_timestamp = datetime.now(timezone.utc)
    logger.info("Starting County Power Scraper...")
    try:
        providers = load_providers()
        for config in providers:
            p_type = config.get('county_type')
            p_name = config.get('name')
            ProviderClass = PROVIDER_REGISTRY.get(p_type)
            if ProviderClass:
                try:
                    provider = ProviderClass(config, session)
                    logger.info(f"Fetching county data for {p_name}...")
                    outages = provider.fetch()
                    logger.info(f"Found {len(outages)} active outage records for {p_name}.")
                    # Process this company's data in one call so upserts and
                    # zero-outs use the same run_timestamp.
                    db.upsert_and_zero_outages(p_name, outages, run_timestamp)
                except Exception as e:
                    # One broken provider must not abort the whole run
                    logger.error(f"Error processing {p_name}: {e}")
            elif p_type:
                logger.warning(f"Unknown provider type '{p_type}' for {p_name}")
        db.run_post_processing()
    finally:
        # Always release the connection, even if a provider or post-processing raised
        db.close()
    logger.info("County scraping complete.")


if __name__ == "__main__":
    main()