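"""Power outage scraper.

Pulls outage data from multiple utility providers (Kubra cluster tiles,
simple JSON feeds, and GWT-RPC endpoints), normalizes each record, and
upserts it into the PostGIS-backed ``newpower`` table. Provider settings
live in providers.json; GWT-RPC providers can auto-repair stale
headers/cookies/bodies via the get_rpc_config_auto helper.
"""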
import requests
import polyline
import json
import psycopg2
import mercantile
import logging
import os
import re
from datetime import datetime, timezone, timedelta
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from pyproj import Transformer
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Import the helper module for auto-repair
import get_rpc_config_auto

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# --- LOGGING ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('power2_new.log'), logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

# --- CONFIG ---
DB_CONFIG = {'host': 'localhost', 'database': 'nws', 'user': 'nws', 'password': 'nws'}
CONFIG_FILE = 'providers.json'
AUTO_UPDATE_COOLDOWN_HOURS = 4  # Only try to repair once every 4 hours
# --- CONFIG MANAGEMENT ---
def load_providers():
    if not os.path.exists(CONFIG_FILE):
        logger.error(f"{CONFIG_FILE} not found!")
        return []
    with open(CONFIG_FILE, 'r') as f:
        return json.load(f)


def save_providers(providers):
    with open(CONFIG_FILE, 'w') as f:
        json.dump(providers, f, indent=4)
    logger.info("Configuration saved to providers.json")


def update_provider_config(provider_name, new_settings):
    """Updates a specific provider in the JSON file safely"""
    providers = load_providers()
    updated = False
    for p in providers:
        if p.get('name') == provider_name:
            # Update all relevant fields
            for key in ['headers', 'body', 'url', 'cookies', 'user_agent']:
                if key in new_settings:
                    p[key] = new_settings[key]

            p['last_auto_update'] = datetime.now(timezone.utc).isoformat()
            updated = True
            break

    if updated:
        save_providers(providers)
        return True
    return False
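# Illustrative providers.json entries (a sketch only -- every value below is a
# placeholder, not a real provider; the key names are the ones this script reads):
# [
#   {"name": "Example JSON Utility", "type": "simple_json",
#    "url": "https://utility.example/outages.json"},
#   {"name": "Example Kubra Utility", "type": "kubra",
#    "meta_url": "https://kubra.io/.../currentState", "layer": "cluster-1",
#    "quadkeys": ["0320"]},
#   {"name": "Example GWT Utility", "type": "gwt_rpc",
#    "url": "https://utility.example/outage/rpc", "map_url": "https://utility.example/outagemap/",
#    "headers": {}, "body": "...", "cookies": [], "epsg": 3857, "state_filter": "WV"}
# ]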
# --- DATABASE ---
class PowerDB:
    def __init__(self, config):
        self.conn = psycopg2.connect(**config)
        self.conn.autocommit = True

    def close(self):
        self.conn.close()

    def upsert_outage(self, data):
        sql = """
            INSERT INTO newpower
                (incidentid, utility, lat, lon, pointgeom, areageom, start_time, etr,
                 outagen, peakoutage, cause, crew_status, active, last_change, fetch_time, geom)
            VALUES
                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, ST_SetSRID(ST_MakePoint(%s, %s), 4326))
            ON CONFLICT (pointgeom) DO UPDATE SET
                outagen = EXCLUDED.outagen,
                peakoutage = GREATEST(newpower.peakoutage, EXCLUDED.outagen),
                cause = EXCLUDED.cause,
                etr = EXCLUDED.etr,
                crew_status = EXCLUDED.crew_status,
                last_change = EXCLUDED.last_change,
                fetch_time = EXCLUDED.fetch_time,
                active = TRUE
        """
        peak = data.get('outagen', 0)
        params = (
            data.get('incidentid'), data.get('utility'), data.get('lat'), data.get('lon'),
            data.get('pointgeom'), data.get('areageom'), data.get('start'), data.get('etr'),
            data.get('outagen'), peak, data.get('cause'), data.get('crew_status'),
            True, data.get('last_change', datetime.now(timezone.utc)), datetime.now(timezone.utc),
            data.get('lon'), data.get('lat')
        )
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)

    def run_post_processing(self):
        logger.info("Running post-processing...")
        with self.conn.cursor() as cursor:
            cursor.execute('UPDATE newpower SET county = c.countyname FROM public.county c WHERE ST_Contains(c.geom, newpower.geom) AND newpower.county IS NULL')
            cursor.execute('UPDATE newpower SET state = c.state FROM public.county c WHERE ST_Contains(c.geom, newpower.geom) AND newpower.state IS NULL')
            cursor.execute('UPDATE newpower SET cwa = f.cwa FROM public.fzone f WHERE ST_Contains(f.geom, newpower.geom) AND newpower.cwa IS NULL')
            cursor.execute('UPDATE newpower SET realareageom = ST_LineFromEncodedPolyline(areageom) WHERE areageom IS NOT NULL AND realareageom IS NULL')
            cursor.execute("UPDATE newpower SET active = TRUE WHERE fetch_time > NOW() - INTERVAL '30 minutes'")
            cursor.execute("UPDATE newpower SET active = FALSE WHERE fetch_time < NOW() - INTERVAL '30 minutes'")
            cursor.execute("DELETE FROM newpower WHERE fetch_time < NOW() - INTERVAL '365 days'")
        logger.info("Post-processing complete.")
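# Assumed shape of the 'newpower' table used by PowerDB above. The real DDL is not
# part of this script; this sketch is only inferred from the queries. Note the
# unique constraint on pointgeom, which the ON CONFLICT upsert relies on:
#
#   CREATE TABLE newpower (
#       incidentid   text,
#       utility      text,
#       lat          double precision,
#       lon          double precision,
#       pointgeom    text UNIQUE,
#       areageom     text,
#       realareageom geometry,
#       start_time   timestamptz,
#       etr          timestamptz,
#       outagen      integer,
#       peakoutage   integer,
#       cause        text,
#       crew_status  text,
#       active       boolean,
#       last_change  timestamptz,
#       fetch_time   timestamptz,
#       county       text,
#       state        text,
#       cwa          text,
#       geom         geometry(Point, 4326)
#   );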
# --- PROVIDERS ---

class BaseProvider(ABC):
    def __init__(self, config, session):
        self.config = config
        self.session = session
        self.name = config.get('name', 'Unknown')

    @abstractmethod
    def fetch(self):
        pass
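# Each provider's fetch() returns a list of normalized outage dicts whose keys
# match what PowerDB.upsert_outage() reads: incidentid, utility, lat, lon,
# pointgeom, areageom, start, etr, outagen, cause, crew_status, last_change.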
class SimpleJsonProvider(BaseProvider):
    def fetch(self):
        url = self.config.get('url')
        try:
            resp = self.session.get(url, verify=False)
            if not resp.ok: return []
            data = resp.json()
            results = []
            for item in data:
                results.append(self._normalize(item))
            return results
        except Exception as e:
            logger.error(f"Error fetching {self.name}: {e}")
            return []

    def _normalize(self, item):
        def safe_parse(ts):
            if not ts: return None
            try: return datetime.fromisoformat(ts.replace('Z', '+00:00'))
            except: return None
        return {
            'incidentid': str(item.get('outageRecID')), 'utility': self.name,
            'lat': item.get('outagePoint', {}).get('lat'), 'lon': item.get('outagePoint', {}).get('lng'),
            'pointgeom': f"{item.get('outagePoint', {}).get('lat')},{item.get('outagePoint', {}).get('lng')}",
            'areageom': None, 'start': safe_parse(item.get('outageStartTime')),
            'etr': safe_parse(item.get('outageEndTime')), 'outagen': item.get('customersOutNow'),
            'cause': item.get('cause'), 'crew_status': item.get('outageWorkStatus'),
            'last_change': safe_parse(item.get('outageModifiedTime'))
        }
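# Feed item shape expected by SimpleJsonProvider._normalize above (illustrative;
# the values are placeholders, only the key names come from the code):
# {
#   "outageRecID": 12345,
#   "outagePoint": {"lat": 38.35, "lng": -81.63},
#   "outageStartTime": "2024-01-01T12:00:00Z",
#   "outageEndTime": "2024-01-01T15:00:00Z",
#   "customersOutNow": 42,
#   "cause": "Tree on line",
#   "outageWorkStatus": "Crew dispatched",
#   "outageModifiedTime": "2024-01-01T13:00:00Z"
# }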
class KubraProvider(BaseProvider):
    def __init__(self, config, session):
        super().__init__(config, session)
        self.max_zoom = 14
        self.results = []
        self.base_url_template = 'https://kubra.io/cluster-data/'

    def fetch(self):
        meta_url = self.config.get('meta_url')
        if not meta_url: return []

        hex1, hex2 = self._get_hexes(meta_url)
        if not hex1: return []

        self.base_url = f"{self.base_url_template}{hex1}/{hex2}/"
        self.layer = self.config.get('layer')
        quadkeys = self.config.get('quadkeys', [])
        if not quadkeys: return []  # nothing to crawl; also avoids an IndexError on quadkeys[0]

        self.results = []
        self._fetch_recursive(quadkeys, set(), zoom=len(quadkeys[0]))
        return self.results
    def _get_hexes(self, url):
        try:
            resp = self.session.get(url)
            path = resp.json().get('data', {}).get('cluster_interval_generation_data')
            parts = path.split('/')
            return parts[2], parts[3]
        except: return None, None
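    # Crawl strategy for _fetch_recursive below: each quadkey tile is fetched from
    # the Kubra cluster-data endpoint; cluster markers are drilled into by decoding
    # the cluster's polyline point, computing the child quadkey at zoom + 1, and
    # recursing until max_zoom; leaf outages are normalized, and neighboring tiles
    # at the same zoom are crawled so clusters spanning tile edges are not missed.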
    def _fetch_recursive(self, quadkeys, seen, zoom):
        for q in quadkeys:
            suffix = q[-3:][::-1]
            url = f"{self.base_url}{suffix}/public/{self.layer}/{q}.json"
            if url in seen: continue
            seen.add(url)
            try:
                resp = self.session.get(url)
                if not resp.ok: continue
                for item in resp.json().get('file_data', []):
                    if item.get('desc', {}).get('cluster', False):
                        if zoom + 1 <= self.max_zoom:
                            p_geom = item.get('geom', {}).get('p', [])
                            if p_geom:
                                next_key = self._get_quadkey_for_point(p_geom[0], zoom + 1)
                                self._fetch_recursive([next_key], seen, zoom + 1)
                        else: self.results.append(self._normalize(item))
                    else:
                        self.results.append(self._normalize(item))
                self._fetch_recursive(self._get_neighbors(q), seen, zoom)
            except: pass
    def _normalize(self, item):
        desc = item.get('desc', {})
        geom = item.get('geom', {})
        p = (geom.get('p') or [None])[0]
        if not p: return {}
        latlon = polyline.decode(p)[0]
        def ts(s):
            if not s or s == 'ETR-NULL': return None
            try: return datetime.strptime(s, "%Y-%m-%dT%H:%M:%S%z")
            except: return None
        return {
            'incidentid': desc.get('inc_id'), 'utility': self.name,
            'lat': latlon[0], 'lon': latlon[1], 'pointgeom': p, 'areageom': geom.get('a'),
            'start': ts(desc.get('start_time')), 'etr': ts(desc.get('etr')),
            'outagen': desc.get('cust_a', {}).get('val', 0), 'cause': desc.get('cause', {}).get('EN-US', "Pending"),
            'crew_status': desc.get('crew_status', {}).get('EN-US'), 'active': True
        }
    def _get_quadkey_for_point(self, p, z):
        ll = polyline.decode(p)[0]
        return mercantile.quadkey(mercantile.tile(lng=ll[1], lat=ll[0], zoom=z))

    def _get_neighbors(self, q):
        t = mercantile.quadkey_to_tile(q)
        return [mercantile.quadkey(n) for n in mercantile.neighbors(t)]
class GwtRpcProvider(BaseProvider):
    def __init__(self, config, session):
        super().__init__(config, session)
        self.transformer = None
        self.state_filter = config.get('state_filter')
        self.map_url = config.get('map_url')

        # 1. Base Headers
        self.session.headers.update({
            'User-Agent': config.get('user_agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'),
            'Accept': '*/*',
            'Sec-Fetch-Site': 'same-origin'
        })

        parsed_url = urlparse(config.get('url'))
        self.session.headers.update({'Origin': f"{parsed_url.scheme}://{parsed_url.netloc}"})

        # 2. Load Cookies
        if config.get('cookies'):
            for cookie in config['cookies']:
                try:
                    self.session.cookies.set(
                        cookie['name'], cookie['value'], domain=cookie['domain'], path=cookie['path']
                    )
                except: pass

        self.STATE_BOUNDS = {
            'WV': {'lat_min': 37.0, 'lat_max': 40.7, 'lon_min': -82.7, 'lon_max': -77.7},
            'OH': {'lat_min': 38.4, 'lat_max': 42.0, 'lon_min': -84.9, 'lon_max': -80.5},
            'KY': {'lat_min': 36.4, 'lat_max': 39.2, 'lon_min': -89.6, 'lon_max': -81.9},
            'IA': {'lat_min': 40.3, 'lat_max': 43.6, 'lon_min': -96.7, 'lon_max': -90.1}
        }

        if config.get('epsg'):
            try:
                self.transformer = Transformer.from_crs(f"EPSG:{config['epsg']}", "EPSG:4326", always_xy=True)
            except: logger.error(f"EPSG Error for {self.name}")
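    # How this provider works: it replays a previously captured GWT-RPC POST (url,
    # headers, body, and cookies stored in providers.json) against the utility's
    # outage service, then mines the response for projected coordinate pairs,
    # reprojecting them to WGS84 with the configured EPSG code and filtering by
    # STATE_BOUNDS. When the captured request has gone stale, attempt_auto_repair()
    # re-captures it with the get_rpc_config_auto helper, at most once per cooldown.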
    def attempt_auto_repair(self):
        if not self.map_url: return False

        # --- Cooldown Check ---
        last_update = self.config.get('last_auto_update')
        if last_update:
            try:
                last_dt = datetime.fromisoformat(last_update)
                if last_dt.tzinfo is None: last_dt = last_dt.replace(tzinfo=timezone.utc)
                if datetime.now(timezone.utc) - last_dt < timedelta(hours=AUTO_UPDATE_COOLDOWN_HOURS):
                    logger.info(f"Skipping auto-repair for {self.name} (Cooldown active).")
                    return False
            except ValueError: pass

        logger.info(f"Attempting Auto-Repair for {self.name}...")

        try:
            # Expecting 4 values: data, headers, cookies, body
            new_data, valid_headers, valid_cookies, valid_body = get_rpc_config_auto.fetch_live_data(self.map_url)

            if valid_headers and valid_body:
                logger.info(f"Repair successful! Updating {self.name}.")

                # Clean Headers (Blacklist approach)
                excluded = {'content-length', 'host', 'connection', 'cookie', 'accept-encoding', 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform', 'origin'}
                clean_headers = {k: v for k, v in valid_headers.items() if k.lower() not in excluded}

                # Ensure Referer is set correctly for next time
                clean_headers['Referer'] = self.map_url

                # Update In-Memory Config
                current_time = datetime.now(timezone.utc).isoformat()
                self.config['headers'] = clean_headers
                self.config['body'] = valid_body
                self.config['cookies'] = valid_cookies
                self.config['user_agent'] = valid_headers.get('user-agent')
                self.config['last_auto_update'] = current_time

                # Update Session
                self.session.cookies.clear()
                for cookie in valid_cookies:
                    self.session.cookies.set(cookie['name'], cookie['value'], domain=cookie['domain'], path=cookie['path'])
                if valid_headers.get('user-agent'):
                    self.session.headers.update({'User-Agent': valid_headers.get('user-agent')})

                # Save to Disk
                new_settings = {
                    'headers': clean_headers,
                    'body': valid_body,
                    'cookies': valid_cookies,
                    'user_agent': valid_headers.get('user-agent')
                }
                update_provider_config(self.name, new_settings)
                return True
        except Exception as e:
            logger.error(f"Auto-repair failed: {e}")

        return False
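    # fetch() notes: GWT-RPC responses begin with "//OK" on success and contain
    # "//EX" when the server raises an exception (e.g. when the stored headers,
    # cookies, or body have gone stale). On "//EX" or HTTP 500 the provider tries
    # one auto-repair followed by one retry before giving up.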
    def fetch(self, is_retry=False):
        url = self.config.get('url')
        headers = self.config.get('headers', {})
        body = self.config.get('body')

        if not url: return []

        try:
            # 3. Dynamic Origin Update
            parsed_url = urlparse(url)
            origin = f"{parsed_url.scheme}://{parsed_url.netloc}"

            # Priority: Configured Referer > Module Base > Origin
            correct_referer = headers.get('Referer') or headers.get('x-gwt-module-base') or origin

            ua = headers.get('User-Agent', self.session.headers['User-Agent'])
            if "Headless" in ua:  # Fallback safety
                ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

            self.session.headers.update({
                'Origin': origin,
                'Referer': correct_referer,
                'User-Agent': ua
            })

            # Prime the session when a map_url is configured and no cookies are stored yet
            if self.map_url and not self.config.get('cookies'):
                try: self.session.get(correct_referer, verify=False, timeout=10)
                except: pass

            req_headers = headers.copy()
            req_headers['Content-Type'] = 'text/x-gwt-rpc; charset=UTF-8'
            req_headers['Referer'] = correct_referer
            req_headers['User-Agent'] = ua

            # Only fetch if we have a body
            if body:
                resp = self.session.post(url, headers=req_headers, data=body, verify=False)
            else:
                resp = type('obj', (object,), {'status_code': 500, 'text': 'No Body', 'ok': False})()

            # 5. Error Handling & Retry
            failed = False
            if "//EX" in resp.text: failed = True
            if resp.status_code == 500: failed = True

            if failed:
                logger.error(f"GWT Failure for {self.name} (Status: {resp.status_code}).")

                if is_retry:
                    logger.error(f"Retry failed for {self.name}. Aborting.")
                    return []

                if self.attempt_auto_repair():
                    logger.info("Retrying fetch with new settings...")
                    return self.fetch(is_retry=True)
                else:
                    return []

            if not resp.ok: return []
            text = resp.text
            if text.startswith('//OK'): text = text[4:]
            return self._extract_outages(json.loads(text))
        except Exception as e:
            logger.error(f"Fetch error {self.name}: {e}")
            return []
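    # _extract_outages heuristic: the decoded GWT-RPC payload is a flat array mixing
    # strings and numbers. Outage locations are recovered by scanning for pairs of
    # large projected coordinates two positions apart, reprojecting them with the
    # configured EPSG transformer (trying both axis orders), keeping points that fall
    # inside the provider's state bounds, and using a nearby short string as the
    # incident id when one exists.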
    def _extract_outages(self, data_list):
        results = []
        if not self.transformer: return []
        processed = set()
        stride = 2

        for i in range(len(data_list) - stride):
            val1 = data_list[i]
            val2 = data_list[i + stride]

            if (isinstance(val1, (int, float)) and isinstance(val2, (int, float)) and
                    abs(val1) > 100000 and abs(val2) > 100000):

                lat, lon = None, None
                try:
                    res_lon, res_lat = self.transformer.transform(val2, val1)
                    if self._is_valid(res_lat, res_lon): lat, lon = res_lat, res_lon
                except: pass
                if not lat:
                    try:
                        res_lon, res_lat = self.transformer.transform(val1, val2)
                        if self._is_valid(res_lat, res_lon): lat, lon = res_lat, res_lon
                    except: pass

                if lat and lon:
                    k = f"{lat:.4f},{lon:.4f}"
                    if k in processed: continue
                    processed.add(k)

                    oid = str(abs(hash(k)))
                    for o in range(1, 15):
                        idx = i - o
                        if idx >= 0 and isinstance(data_list[idx], str):
                            s = data_list[idx]
                            if len(s) < 20 and "java" not in s and "http" not in s: oid = s; break

                    results.append({
                        'incidentid': oid, 'utility': self.name,
                        'lat': lat, 'lon': lon, 'pointgeom': k, 'areageom': None,
                        'start': datetime.now(timezone.utc), 'etr': None, 'outagen': 1,
                        'cause': "Unknown", 'crew_status': "Unknown", 'active': True,
                        'last_change': datetime.now(timezone.utc)
                    })
        return results
    def _is_valid(self, lat, lon):
        if not self.state_filter: return True
        b = self.STATE_BOUNDS.get(self.state_filter)
        if not b: return True
        return b['lat_min'] <= lat <= b['lat_max'] and b['lon_min'] <= lon <= b['lon_max']
# --- REGISTRY ---
PROVIDER_REGISTRY = {
    'kubra': KubraProvider,
    'simple_json': SimpleJsonProvider,
    'gwt_rpc': GwtRpcProvider
}
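# To support a new source, subclass BaseProvider, implement fetch(), register the
# class here, and reference its key via the "type" field in providers.json.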
# --- MAIN ---
def main():
    S = requests.Session()
    S.verify = False
    db = PowerDB(DB_CONFIG)

    logger.info("Starting Power Scraper...")
    providers = load_providers()

    for config in providers:
        p_type = config.get('type')
        p_name = config.get('name')

        ProviderClass = PROVIDER_REGISTRY.get(p_type)
        if ProviderClass:
            try:
                provider = ProviderClass(config, S)
                logger.info(f"Fetching {p_name}...")
                outages = provider.fetch()

                count = 0
                for outage in outages:
                    if outage:
                        db.upsert_outage(outage)
                        count += 1
                logger.info(f"Saved {count} records for {p_name}")
            except Exception as e:
                logger.error(f"Error processing {p_name}: {e}")
        else:
            logger.warning(f"Unknown provider type {p_type} for {p_name}")

    db.run_post_processing()
    db.close()
    logger.info("Scraping complete.")


if __name__ == "__main__":
    main()