# test/newpower2.py
import os
import sys
# Add the script's directory to the Python path to ensure modules can be found
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)
import requests
import polyline
import json
import psycopg2
import mercantile
import logging
import re
from datetime import datetime, timezone, timedelta
from urllib.parse import urlparse
from pyproj import Transformer
from requests.packages.urllib3.exceptions import InsecureRequestWarning
# Import the helper module for auto-repair
import get_rpc_config_auto
# Import provider classes
from providers.base import BaseProvider
from providers.kubra import KubraProvider
from providers.simple import SimpleJsonProvider
from providers.gwt_rpc import GwtRpcProvider
from providers.nisc import NiscHostedProvider
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# --- LOGGING ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(os.path.join(SCRIPT_DIR, 'power2_new.log')),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
# --- CONFIG ---
DB_CONFIG = {'host': 'localhost', 'database': 'nws', 'user': 'nws', 'password': 'nws'}
CONFIG_FILE = os.path.join(SCRIPT_DIR, 'providers.json')
AUTO_UPDATE_COOLDOWN_HOURS = 4 # Only try to repair once every 4 hours
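# The expected shape of providers.json is sketched below. Field values are
# hypothetical, but the keys match what this script actually reads: 'name'
# and 'type' in load_providers()/main(), and 'headers', 'body', 'url',
# 'cookies', 'user_agent' in update_provider_config():
#
#   [
#       {
#           "name": "Example Utility",
#           "type": "simple_json",
#           "url": "https://example.com/outages.json",
#           "headers": {"Accept": "application/json"},
#           "user_agent": "Mozilla/5.0",
#           "last_auto_update": "2025-01-01T00:00:00+00:00"
#       }
#   ]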
# --- CONFIG MANAGEMENT ---
def load_all_providers():
    """Load the full, unfiltered provider list from providers.json."""
    if not os.path.exists(CONFIG_FILE):
        logger.error(f"{CONFIG_FILE} not found!")
        return []
    with open(CONFIG_FILE, 'r') as f:
        return json.load(f)

def load_providers():
    """Return only the providers that have a 'type' for point-based scraping."""
    return [p for p in load_all_providers() if p.get('type')]

def save_providers(providers):
    with open(CONFIG_FILE, 'w') as f:
        json.dump(providers, f, indent=4)
    logger.info("Configuration saved to providers.json")

def update_provider_config(provider_name, new_settings):
    """Updates a specific provider in the JSON file safely."""
    # Load the unfiltered list: going through load_providers() here would
    # silently drop any provider without a 'type' when the file is saved back.
    providers = load_all_providers()
    updated = False
    for p in providers:
        if p.get('name') == provider_name:
            # Update all relevant fields
            for key in ['headers', 'body', 'url', 'cookies', 'user_agent']:
                if key in new_settings:
                    p[key] = new_settings[key]
            p['last_auto_update'] = datetime.now(timezone.utc).isoformat()
            updated = True
            break
    if updated:
        save_providers(providers)
        return True
    return False
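# Example call (hypothetical provider name and settings), as an auto-repair
# step might use it to refresh a provider's request details:
#
#   update_provider_config('Example Utility', {
#       'url': 'https://example.com/v2/outages.json',
#       'headers': {'Accept': 'application/json'},
#   })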
# --- HELPERS ---
def remove_external_curly_braces(s):
    """Normalize an areageom value: return the first element if 's' is a
    list, strip one pair of outer curly braces if 's' is a wrapped string,
    and otherwise return 's' unchanged."""
    if isinstance(s, list) and s:
        return s[0]
    # This handles strings like "{...}" which were seen in older data.
    if isinstance(s, str) and s.startswith('{') and s.endswith('}'):
        return s[1:-1]
    return s
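# Illustrative inputs and outputs (hypothetical values):
#   remove_external_curly_braces(['abc', 'def'])  -> 'abc'
#   remove_external_curly_braces('{encoded}')     -> 'encoded'
#   remove_external_curly_braces('plain')         -> 'plain'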
# --- DATABASE ---
class PowerDB:
    def __init__(self, config):
        self.conn = psycopg2.connect(**config)
        self.conn.autocommit = True

    def close(self):
        self.conn.close()

    def upsert_outage(self, data):
        sql = """
            INSERT INTO newpower
                (incidentid, utility, lat, lon, pointgeom, areageom, start_time, etr,
                 outagen, peakoutage, cause, crew_status, active, last_change, fetch_time, geom)
            VALUES
                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                 ST_SetSRID(ST_MakePoint(%s, %s), 4326))
            ON CONFLICT (pointgeom) DO UPDATE SET
                outagen = EXCLUDED.outagen,
                peakoutage = GREATEST(newpower.peakoutage, EXCLUDED.outagen),
                cause = EXCLUDED.cause,
                etr = EXCLUDED.etr,
                crew_status = EXCLUDED.crew_status,
                last_change = EXCLUDED.last_change,
                fetch_time = EXCLUDED.fetch_time,
                active = TRUE
        """
        peak = data.get('outagen', 0)
        # Clean areageom before insertion, referencing old power2.py logic
        areageom = remove_external_curly_braces(data.get('areageom'))
        params = (
            data.get('incidentid'), data.get('utility'), data.get('lat'), data.get('lon'),
            data.get('pointgeom'), areageom, data.get('start'), data.get('etr'),
            data.get('outagen'), peak, data.get('cause'), data.get('crew_status'),
            True, data.get('last_change', datetime.now(timezone.utc)), datetime.now(timezone.utc),
            data.get('lon'), data.get('lat')
        )
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
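    # The outage dicts passed to upsert_outage() are expected to carry the
    # keys read above. A hypothetical example (all values are illustrative):
    #
    #   {
    #       'incidentid': 'INC-123', 'utility': 'Example Utility',
    #       'lat': 35.0, 'lon': -97.0,
    #       'pointgeom': 'u0p1xk', 'areageom': 'an-encoded-polyline',
    #       'start': datetime(2025, 1, 1, tzinfo=timezone.utc),
    #       'etr': datetime(2025, 1, 2, tzinfo=timezone.utc),
    #       'outagen': 42, 'cause': 'Storm', 'crew_status': 'Assigned',
    #       'last_change': datetime(2025, 1, 1, tzinfo=timezone.utc),
    #   }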
    def run_post_processing(self):
        logger.info("Running post-processing...")
        with self.conn.cursor() as cursor:
            cursor.execute('UPDATE newpower SET county = c.countyname FROM public.county c WHERE ST_Contains(c.geom, newpower.geom) AND newpower.county IS NULL')
            cursor.execute('UPDATE newpower SET state = c.state FROM public.county c WHERE ST_Contains(c.geom, newpower.geom) AND newpower.state IS NULL')
            cursor.execute('UPDATE newpower SET cwa = f.cwa FROM public.fzone f WHERE ST_Contains(f.geom, newpower.geom) AND newpower.cwa IS NULL')
            cursor.execute('UPDATE newpower SET realareageom = ST_LineFromEncodedPolyline(areageom) WHERE areageom IS NOT NULL AND realareageom IS NULL')
            cursor.execute("UPDATE newpower SET active = TRUE WHERE fetch_time > NOW() - INTERVAL '30 minutes'")
            cursor.execute("UPDATE newpower SET active = FALSE WHERE fetch_time < NOW() - INTERVAL '30 minutes'")
            cursor.execute("DELETE FROM newpower WHERE fetch_time < NOW() - INTERVAL '365 days'")
        logger.info("Post-processing complete.")
# --- REGISTRY ---
PROVIDER_REGISTRY = {
    'kubra': KubraProvider,
    'simple_json': SimpleJsonProvider,
    'gwt_rpc': GwtRpcProvider,
    'nisc_hosted': NiscHostedProvider,
}
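# main() assumes two things about a registered provider class: it is
# constructed with (config, session) and it exposes fetch() returning an
# iterable of outage dicts for upsert_outage(). A hypothetical new provider
# might look like this (BaseProvider's real interface lives in
# providers/base.py and may differ; this is an illustration, not that
# module's API):
#
#   class ExampleProvider(BaseProvider):
#       def __init__(self, config, session):
#           self.config = config
#           self.session = session
#
#       def fetch(self):
#           resp = self.session.get(self.config['url'], timeout=30)
#           resp.raise_for_status()
#           for item in resp.json():
#               yield {'incidentid': item.get('id'),
#                      'outagen': item.get('customers_out')}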
# --- MAIN (Point Scraper) ---
def main():
    S = requests.Session()
    S.verify = False  # TLS verification is off; InsecureRequestWarning is suppressed above
    db = PowerDB(DB_CONFIG)
    logger.info("Starting Power Scraper...")
    providers = load_providers()
    for config in providers:
        p_type = config.get('type')
        p_name = config.get('name')
        ProviderClass = PROVIDER_REGISTRY.get(p_type)
        if ProviderClass:
            try:
                provider = ProviderClass(config, S)
                logger.info(f"Fetching {p_name}...")
                outages = provider.fetch()
                count = 0
                for outage in outages:
                    if outage:
                        db.upsert_outage(outage)
                        count += 1
                logger.info(f"Saved {count} records for {p_name}")
            except Exception as e:
                logger.error(f"Error processing {p_name}: {e}")
        else:
            logger.warning(f"Unknown provider type {p_type} for {p_name}")
    db.run_post_processing()
    db.close()
    logger.info("Scraping complete.")
if __name__ == "__main__":
    main()