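"""News aggregator for the NWS Charleston, WV (RLX) county warning area.

Scrapes headlines from regional TV station websites, scores each story for
weather relevance (keyword ratio) and estimated impact (weighted keywords with
geographic bonuses and out-of-area penalties), and upserts the results into a
local PostgreSQL database.
"""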

import requests
import json
from datetime import datetime, timezone
import re
from bs4 import BeautifulSoup
import psycopg2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# --- Configuration & Setup ---

# --- Database Connection ---
try:
    conn = psycopg2.connect(
        host='localhost',
        database='nws',
        user='nws',
        password='nws'
    )
    cursor = conn.cursor()
except psycopg2.OperationalError as e:
    print(f"FATAL: Could not connect to the database. Error: {e}")
    exit()
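
# The script assumes a `news` table already exists. A minimal sketch of a
# compatible schema, inferred from the INSERT/ON CONFLICT statement and the
# cleanup query below (column types are assumptions, not taken from the source):
#
#   CREATE TABLE IF NOT EXISTS news (
#       id            SERIAL PRIMARY KEY,
#       headline      TEXT,
#       summary       TEXT,
#       storylink     TEXT UNIQUE,
#       updated       TEXT,
#       source        TEXT,
#       imageurl      TEXT,
#       timeutc       TEXT,
#       impact_score  INTEGER,
#       nlpscore      REAL
#   );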

S = requests.Session()
S.headers.update({'User-Agent': 'NWS Charleston News Aggregator Bot/1.0'})

# --- NLTK Downloader ---
try:
    # Both lookups raise LookupError if the corpora are not installed yet.
    stopwords.words('english')
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK data (stopwords, punkt)...")
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    print("Download complete.")

# --- START: Original Scoring Components ---
WEATHER_KEYWORDS = {
    'weather', 'flood', 'fire', 'fog', 'snow', 'emergency', 'wind', 'ice', 'rain',
    'power', 'explosion', 'warmer', 'colder', 'drown', 'stream', 'river', 'air',
    'destroyed', 'rime', 'glaze', 'creek', 'crash', 'thunder', 'spinup', 'black ice',
    'spill', 'pileup', 'pile-up', 'gust', 'frozen', 'funnel', 'rainfall', 'fatal',
    'injury', 'sleet', 'injured', 'frost', 'dead', 'death', 'landslide', 'culvert',
    'slippery', 'wildfire', 'tornado', 'blizzard', 'hail', 'thunderstorm',
    'downburst', 'microburst', 'heatstroke', 'derecho', 'lightning', 'hypothermia',
    'slide', 'flow', 'ski', 'water', 'inundation', 'victim', 'victims', 'flooding',
    'flooded', 'snowing', 'freezing rain', 'clouds', 'cloud', 'storm'
}

NEGATIVE_KEYWORDS = {
    'tennessee', 'frankfurt', 'louisville', 'pennsylvania', 'johnson co',
    'taylor swift', 'pittsburgh'
}

def calculate_relevance(headline, summary):
    """Return a 0.0-1.0 relevance score: the ratio of weather keywords to
    content words, doubled and capped at 1.0. Matching is per token, so
    multi-word entries such as 'black ice' never match as a phrase; only
    their single-word components (e.g. 'ice') can match."""
    headline_str = (headline or "").lower()
    summary_str = (summary or "").lower()
    text = headline_str + " " + summary_str
    if not text.strip(): return 0.0
    tokens = word_tokenize(text)
    if any(token in NEGATIVE_KEYWORDS for token in tokens): return 0.0
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalnum()]
    if not filtered_tokens: return 0.0
    matches = sum(1 for word in filtered_tokens if word in WEATHER_KEYWORDS)
    score = (matches / len(filtered_tokens)) * 2.0
    return round(min(1.0, score), 2)
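
# Illustrative example (not executed): a headline like
#   "Flash flood warning issued for Kanawha County"
# yields six content tokens after stopword removal, one of which ('flood')
# is a weather keyword, so calculate_relevance would return roughly 0.33.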

# --- END: Original Scoring Components ---

# --- Advanced Scorer Keywords ---
IMPACT_KEYWORDS = {
    # High Impact (Catastrophic, Life-Threatening)
    'fatal': 25, 'fatality': 25, 'drown': 25, 'death': 25, 'killed': 25,
    'tornado': 120, 'derecho': 20, 'landslide': 50, 'mudslide': 50, 'evacuation': 100,
    'explosion': 100, 'spill': 15, 'derailment': 55, 'emergency': 15, 'injured': 10,
    'injury': 10, 'crash': 10, 'pileup': 10, 'pile-up': 100, 'flood': 40,
    'flooding': 10, 'inundation': 10, 'rescue': 10, 'wildfire': 40, 'water rescue': 100,
    'dam failure': 100,
    # New High Impact Suggestions
    'deceased': 25, 'victim': 25, 'tragedy': 25,  # Synonyms for fatality
    'avalanche': 20, 'mudflow': 80, 'debris flow': 80, 'rockslide': 80,  # Earth movement
    'levee break': 80, 'dam break': 120, 'levee breach': 100,  # Water infrastructure failure
    'state of emergency': 20, 'disaster': 20, 'shelter': 15,  # Official declarations & response
    'hazmat': 95, 'hazardous material': 55,  # Secondary emergencies
    'hypothermia': 15, 'heat stroke': 55, 'heat injury': 55,  # Direct human health impacts
    'structure collapse': 100, 'building collapse': 100, 'roof collapse': 35,  # Damage

    # Medium Impact (Significant Disruption & Damage)
    'thunderstorm': 40, 't-storm': 8, 'lightning': 80, 'hail': 80, 'power outage': 8,
    'outage': 8, 'downed trees': 50, 'tree down': 50, 'road closure': 7, 'closed': 7,
    'stuck': 7, 'stranded': 7, 'high water': 7, 'wind': 7, 'gust': 7, 'blizzard': 7,
    # New Medium Impact Suggestions
    'microburst': 50, 'downburst': 50, 'wind damage': 50,  # Damaging winds
    'power lines down': 80, 'utility pole': 50, 'transformer fire': 8,  # Power infrastructure
    'washout': 8, 'washed out': 8, 'submerged': 8,  # Flooding impacts
    'accident': 7, 'wreck': 7, 'rollover': 7,  # Transportation
    'roadblock': 7, 'detour': 7, 'traffic': 7,  # Transportation flow
    'damage': 7, 'damaged': 7,  # General damage term
    'funnel cloud': 99, 'waterspout': 99,  # Tornadic precursors
    'storm surge': 9, 'coastal flood': 9, 'high surf': 8, 'rip current': 8,  # Coastal impacts
    'dust storm': 8, 'haboob': 8,  # Dust/visibility (regional)

    # Situational Awareness (Large outdoor events sensitive to weather)
    'festival': 5, 'marathon': 5, 'state fair': 5, 'parade': 5,
    # New Situational Awareness Suggestions
    'concert': 5, 'game': 5, 'stadium': 5, 'county fair': 5,  # Events
    'regatta': 5, 'triathlon': 5, 'outdoor event': 5, 'fireworks': 5,  # Events

    # Low Impact / Hazardous Conditions (Note: some scores are high due to potential)
    'rain': 5, 'snow': 15, 'sleet': 45, 'ice': 35, 'frost': 10,
    'storm': 5, 'slippery': 35, 'slide': 5, 'flow': 2,
    # New Low Impact / Hazardous Conditions Suggestions
    'freezing rain': 55, 'black ice': 55, 'icy roads': 55,  # High-impact ice
    'fog': 25, 'low visibility': 15, 'whiteout': 55, 'blowing snow': 12,  # Visibility hazards
    'heat wave': 32, 'excessive heat': 32, 'heat index': 10, 'wind chill': 10,  # Temp hazards
    'heavy rain': 35, 'downpour': 35, 'heavy snow': 35,  # Intense precip
    'slick': 35, 'treacherous': 55,  # Road conditions
    'advisory': 6, 'warning': 8, 'watch': 7,  # NWS products
    # Local roads and rivers
    'i-64': 10, 'i-77': 10, 'i-79': 10, 'route 60': 10, 'corridor g': 10,
    'turnpike': 10, 'kanawha river': 15, 'ohio river': 15, 'elk river': 15,
    'mud river': 15, 'guyandotte': 15, 'gauley': 15, 'coal river': 15,
}

GEOGRAPHIC_KEYWORDS = {
    # WV Counties
    'barbour', 'boone', 'braxton', 'cabell', 'calhoun', 'clay', 'doddridge', 'gilmer', 'harrison',
    'jackson', 'kanawha', 'lewis', 'lincoln', 'logan', 'mason', 'mingo', 'pleasants', 'putnam',
    'ritchie', 'roane', 'taylor', 'tyler', 'upshur', 'wayne', 'webster', 'wirt', 'wood', 'wyoming',
    # KY Counties
    'boyd', 'carter', 'greenup', 'lawrence',
    # OH Counties
    'athens', 'gallia', 'jackson', 'lawrence', 'meigs', 'morgan', 'perry', 'vinton',
    'washington',
    # VA Counties
    'buchanan', 'dickenson',

    # Major Cities (Original)
    'charleston', 'huntington', 'parkersburg', 'clarksburg', 'ashland', 'marietta', 'portsmouth',

    # --- NEW ADDITIONS ---
    # West Virginia Cities & Towns
    'philippi', 'madison', 'sutton', 'gassaway', 'barboursville', 'milton', 'grantsville', 'beckley',
    'west union', 'glenville', 'bridgeport', 'shinnston', 'salem', 'ripley', 'ravenswood',
    'south charleston', 'st. albans', 'dunbar', 'nitro', 'teays valley', 'hurricane',
    'weston', 'hamlin', 'logan', 'chapmanville', 'point pleasant', 'williamson', 'st. marys',
    'winfield', 'harrisville', 'pennsboro', 'spencer', 'grafton', 'middlebourne', 'sistersville',
    'buckhannon', 'wayne', 'kenova', 'ceredo', 'webster springs', 'elizabeth', 'vienna',
    'williamstown', 'pineville', 'mullens',

    # Ohio Cities & Towns
    'belpre', 'nelsonville', 'gallipolis', 'wellston', 'oak hill', 'ironton', 'south point',
    'pomeroy', 'middleport', 'mcconnelsville', 'new lexington', 'somerset', 'mcarthur', 'vienna',

    # Kentucky Cities & Towns
    'catlettsburg', 'grayson', 'olive hill', 'russell', 'louisa',

    # Virginia Towns
    'grundy', 'clintwood'
}

# Penalty keywords: out-of-area locations and low-impact events.
# Keys must be lowercase; the scorer compares against lowercased text.
PENALTY_KEYWORDS = {
    # Out-of-area locations
    'pittsburgh': -50, 'columbus': -25, 'cincinnati': -50, 'lexington': -25, 'texas': -50,
    'tennessee': -100, 'pennsylvania': -50,
    'morgantown': -40, 'bluefield': -40, 'wheeling': -50,  # In WV, but outside RLX CWA
    'roanoke': -50, 'blacksburg': -50,  # In VA, but outside RLX CWA
    'cleveland': -75, 'dayton': -50, 'akron': -60,  # Other OH cities
    'louisville': -50,  # Other KY cities
    'knoxville': -75, 'bristol': -50,  # TN cities
    'north carolina': -100, 'maryland': -100, 'gatlinburg': -50,
    # Low-impact events
    'homecoming': -90, 'back to school': -40, 'dance': -20, 'book fair': -10,
    'classes': -10, 'taylor swift': -100, 'trump': -50,
}

def calculate_impact_score(headline, summary):
    headline_str = (headline or "").lower()
    summary_str = (summary or "").lower()
    # Combine headline and summary into one lowercased string
    text = f"{headline_str} {summary_str}"

    if not text.strip(): return 0

    score = 0
    geo_relevance_found = False

    # 0. Negation Set
    # If these words appear immediately before a keyword, ignore the keyword
    negators = {'no', 'not', 'unlikely', 'false', 'zero', 'minimal', 'avoided'}

    # 1. Geographic Bonus
    if any(area in text for area in GEOGRAPHIC_KEYWORDS):
        score += 50
        geo_relevance_found = True

    # 2. Impact Keyword Scores with Negation Check
    tokens = word_tokenize(text)  # Use NLTK tokens for positional accuracy

    for keyword, value in IMPACT_KEYWORDS.items():
        # Handle multi-word keywords (e.g., "dam break") with a substring check
        if " " in keyword:
            if keyword in text:
                score += value
        # Handle single-word keywords with negation check
        elif keyword in tokens:
            # Find every position of the keyword
            indices = [i for i, x in enumerate(tokens) if x == keyword]
            for i in indices:
                # Check the previous 2 words for negators
                prev_words = set(tokens[max(0, i - 2):i])
                if not prev_words.intersection(negators):
                    score += value
                    # Headline Bonus
                    if keyword in headline_str:
                        score += (value * 0.5)
                    break  # Only count the keyword once per article to prevent inflation

    # 3. Compound Impact Bonus
    # Weather impacts rarely happen in isolation.
    # If we see multiple related categories, boost the score.
    found_keywords = [k for k in IMPACT_KEYWORDS if k in text]
    if any('wind' in k for k in found_keywords) and any('outage' in k for k in found_keywords):
        score += 30  # Confidence boost: Wind causing outages
    if any('rain' in k for k in found_keywords) and any('slide' in k for k in found_keywords):
        score += 50  # Confidence boost: Rain causing mudslides

    # 4. Penalty Logic (Refined)
    for keyword, penalty in PENALTY_KEYWORDS.items():
        if keyword in text:
            if not geo_relevance_found and keyword in ['pittsburgh', 'columbus', 'tennessee']:
                score += (penalty * 2.0)  # Double penalty for out-of-area specifics
            else:
                score += penalty

    return max(0, int(score))
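
# Illustrative example (not executed): for a headline like
#   "Downed trees and power outages reported in Kanawha County"
# the scorer should land around 108: +50 geographic bonus ('kanawha'),
# +50 for 'downed trees', and +8 for 'power outage' (substring match against
# "power outages"); no negators or penalty keywords apply.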

def standardize_time(time_string):
    if not time_string: return None
    time_string_cleaned = re.sub(r'(\.\d+)', '', str(time_string).strip())
    try:
        dt = datetime.fromisoformat(time_string_cleaned.replace('Z', '+00:00'))
        return dt.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M")
    except (ValueError, TypeError):
        return None
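
# Illustrative example (not executed): standardize_time("2024-05-01T14:30:00.123Z")
# strips the fractional seconds, parses the ISO timestamp as UTC, and should
# return "2024-05-01 14:30"; unparseable strings return None.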

# --- News Source Scraping Functions ---
def get_json(url):
    try:
        response = S.get(url, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, json.JSONDecodeError) as e:
        print(f"Error fetching or parsing JSON from {url}: {e}")
        return None

def trendingstories(url, extras):
    data = get_json(url + extras)
    if not data: return
    for item in data.get('teasers', []):
        story_path = item.get('url')
        if not story_path: continue
        stories.append([
            item.get('title'), item.get('summary'), url + story_path,
            item.get('updatedDateISO8601'),
            url + item.get('heroImageUrl') if item.get('heroImageUrl') else None, url
        ])

def lakana(url, extras):
    data = get_json(url + extras)
    if not data: return

    # Helper to handle the API returning either a List [] or a Dict {}
    def extract_articles(section_data):
        if isinstance(section_data, dict):
            # Normal case: it's a dict containing an "articles" key
            return section_data.get('articles', [])
        elif isinstance(section_data, list):
            # Edge case: it's already a list of articles (or empty list)
            return section_data
        return []

    # Safely extract from both sections;
    # one of these is likely a list, which was breaking the old code
    articles = extract_articles(data.get('additional_top_stories')) + \
               extract_articles(data.get('top_stories'))

    for item in articles:
        # Ensure we are looking at a dictionary item
        if not isinstance(item, dict): continue

        # Safely extract nested fields
        date_val = item.get('date')
        date_str = date_val.get('datetime') if isinstance(date_val, dict) else str(date_val)

        thumb_val = item.get('thumbnail')
        thumb_src = thumb_val.get('src') if isinstance(thumb_val, dict) else None

        stories.append([
            item.get('home_page_title') or item.get('title'),
            item.get('title'),
            item.get('link'),
            date_str,
            thumb_src,
            url
        ])

def lakanatitle(url, extras):
    data = get_json(url + extras)
    if not data: return
    articles = data.get('additional_top_stories', {}).get('articles', []) + \
               data.get('top_stories', {}).get('articles', [])
    for item in articles:
        stories.append([
            item.get('title'), item.get('title'), item.get('link'),
            item.get('date', {}).get('datetime'), item.get('thumbnail', {}).get('src'), url
        ])

def fusion(url, extras):
    try:
        html_content = S.get(url, timeout=10).text
    except requests.RequestException as e:
        print(f"Error fetching HTML from {url}: {e}")
        return
    soup = BeautifulSoup(html_content, 'html.parser')
    script_tag = soup.find('script', id='fusion-metadata')
    if script_tag and script_tag.string:
        match = re.search(r'Fusion\.globalContent\s*=\s*(\{.*?\});', script_tag.string, re.DOTALL)
        if match:
            try:
                data = json.loads(match.group(1))
                for item in data.get('content_elements', []):
                    try:
                        website_url_path = item.get('website_url')
                        if not website_url_path: continue
                        stories.append([
                            item.get('headlines', {}).get('basic'),
                            item.get('description', {}).get('basic'),
                            url + website_url_path,
                            item.get('display_date'),
                            item.get('promo_items', {}).get('basic', {}).get('url'), url
                        ])
                    except (KeyError, TypeError):
                        continue
            except json.JSONDecodeError:
                print(f"Failed to parse Fusion JSON from {url}")
# --- End of Scraping Functions ---

# Each entry in stories is a row of:
# [headline, summary, story link, datetime string, image URL, source URL]
stories = []

# Each source is [base URL, API path or query string, name of the parser function above].
newsarray = [
    ["https://wsaz.com", "", "fusion"], ["https://wtap.com", "", "fusion"],
    ["https://wdtv.com", "", "fusion"], ["https://wymt.com", "", "fusion"],
    ["https://wboy.com", "/wp-json/lakana/v1/template-variables/", "lakana"],
    ["https://wowktv.com", "/wp-json/lakana/v1/template-variables/", "lakanatitle"],
    ["https://wchstv.com", "/api/rest/audience/trending-stories?count=25", "trendingstories"],
    ["https://wvva.com", "", "fusion"],
    ["https://wjhl.com", "/wp-json/lakana/v1/template-variables/", "lakana"],
    ["https://wcyb.com", "/api/rest/audience/trending-stories?count=25", "trendingstories"],
    ["https://wvnstv.com", "/wp-json/lakana/v1/template-variables/", "lakanatitle"],
]
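
# To add a source, append another [base URL, path, parser] triple. For example
# (hypothetical station and path, shown only to illustrate the format):
#   newsarray.append(["https://example-station.com",
#                     "/wp-json/lakana/v1/template-variables/", "lakana"])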

def runmedia():
    for source_info in newsarray:
        url, extra, func_name = source_info
        try:
            print(f"Fetching from: {url}")
            func = globals()[func_name]
            func(url, extra)
        except Exception as e:
            print(f"Failed to process {url}. Error: {e}")

def process_and_insert_stories():
    print(f"\nCollected {len(stories)} stories. Processing and inserting...")
    inserted_count = 0
    for story_data in stories:
        try:
            if len(story_data) != 6: continue
            headline, summary, link, dt_str, img_url, source = story_data
            timeutc = standardize_time(dt_str)
            if not timeutc or not link or not headline: continue

            impact_score = calculate_impact_score(headline, summary)
            nlpscore = calculate_relevance(headline, summary)

            print(f" - Impact[{impact_score}] NLP[{nlpscore}]: {headline}")

            sql = """
                INSERT INTO news (headline, summary, storylink, updated, source, imageurl, timeutc, impact_score, nlpscore)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (storylink) DO UPDATE SET
                    (headline, summary, updated, imageurl, timeutc, impact_score, nlpscore) =
                    (EXCLUDED.headline, EXCLUDED.summary, EXCLUDED.updated, EXCLUDED.imageurl,
                     EXCLUDED.timeutc, EXCLUDED.impact_score, EXCLUDED.nlpscore);
            """

            params = (headline, summary, link, timeutc, source, img_url, timeutc, impact_score, nlpscore)

            cursor.execute(sql, params)
            inserted_count += 1
        except Exception as e:
            print(f"!!! DATABASE ERROR processing story row: {story_data}. Error: {e}")
            # A failed execute leaves the psycopg2 transaction aborted; roll back
            # so later rows in this batch can still be inserted.
            conn.rollback()
            continue

    conn.commit()
    print(f"\nDatabase insertion/update complete. {inserted_count} stories were processed and inserted/updated.")

def cleandb():
    print("Cleaning duplicate summaries from the database...")
    sql = """
        DELETE FROM news a USING news b
        WHERE a.id < b.id AND a.summary = b.summary;
    """
    cursor.execute(sql)
    conn.commit()
    print(f"Cleaned {cursor.rowcount} duplicate entries.")
if __name__ == "__main__":
|
|
runmedia()
|
|
process_and_insert_stories()
|
|
cleandb()
|
|
cursor.close()
|
|
conn.close()
|
|
print("Process finished and connection closed.")
|