# test/news.py: NWS Charleston news aggregator
import requests
import json
from datetime import datetime, timezone
import re
from bs4 import BeautifulSoup
import psycopg2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
# --- Configuration & Setup ---
# --- Database Connection ---
try:
conn = psycopg2.connect(
host='localhost',
database='nws',
user='nws',
password='nws'
)
cursor = conn.cursor()
except psycopg2.OperationalError as e:
print(f"FATAL: Could not connect to the database. Error: {e}")
exit()
S = requests.Session()
S.headers.update({'User-Agent': 'NWS Charleston News Aggregator Bot/1.0'})
# --- NLTK Downloader ---
try:
stopwords.words('english')
nltk.data.find('tokenizers/punkt')
except LookupError:
    # nltk.data.find() and stopwords.words() raise LookupError when the data is missing
print("Downloading NLTK data (stopwords, punkt)...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
print("Download complete.")
# --- START: Original Scoring Components ---
WEATHER_KEYWORDS = {
'weather', 'flood', 'fire', 'fog', 'snow', 'emergency', 'wind', 'ice', 'rain',
'power', 'explosion', 'warmer', 'colder', 'drown', 'stream', 'river', 'air',
'destroyed', 'rime', 'glaze', 'creek', 'crash', 'thunder', 'spinup', 'black ice',
'spill', 'pileup', 'pile-up', 'gust', 'frozen', 'funnel', 'rainfall', 'fatal',
'injury', 'sleet', 'injured', 'frost', 'dead', 'death', 'landslide', 'culvert',
'slippery', 'wildfire', 'tornado', 'blizzard', 'hail', 'thunderstorm',
'downburst', 'microburst', 'heatstroke', 'derecho', 'lightning', 'hypothermia',
'slide', 'flow', 'ski', 'water', 'inundation', 'victim', 'victims', 'flooding',
'flooded','snowing','freezing rain','clouds','cloud','storm'
}
NEGATIVE_KEYWORDS = {
'tennessee', 'frankfurt', 'louisville', 'pennsylvania', 'johnson co',
'taylor swift', 'pittsburgh'
}
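# calculate_relevance: crude keyword-density score in [0.0, 1.0] over the
# non-stopword tokens of the headline + summary; any NEGATIVE_KEYWORDS hit
# zeroes the score outright.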
def calculate_relevance(headline, summary):
    headline_str = (headline or "").lower()
    summary_str = (summary or "").lower()
    text = headline_str + " " + summary_str
    if not text.strip(): return 0.0
    tokens = word_tokenize(text)
    # Multi-word entries (e.g. 'taylor swift', 'black ice') can never equal a single token,
    # so match them against the raw text; single-word entries are matched per token.
    if any((kw in text) if " " in kw else (kw in tokens) for kw in NEGATIVE_KEYWORDS): return 0.0
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word.isalnum()]
    if not filtered_tokens: return 0.0
    matches = sum(1 for word in filtered_tokens if word in WEATHER_KEYWORDS)
    # Phrase keywords (e.g. 'freezing rain') are counted via a substring check on the text
    matches += sum(1 for kw in WEATHER_KEYWORDS if " " in kw and kw in text)
    score = (matches / len(filtered_tokens)) * 2.0
    return round(min(1.0, score), 2)
# --- END: Original Scoring Components ---
# --- Advanced Scorer Keywords ---
IMPACT_KEYWORDS = {
# High Impact (Catastrophic, Life-Threatening)
'fatal': 25, 'fatality': 25, 'drown': 25, 'death': 25, 'killed': 25,
'tornado': 120, 'derecho': 20, 'landslide': 50, 'mudslide': 50, 'evacuation': 100,
'explosion': 100, 'spill': 15, 'derailment': 55, 'emergency': 15, 'injured': 10,
'injury': 10, 'crash': 10, 'pileup': 10, 'pile-up': 100, 'flood': 40,
    'flooding': 10, 'inundation': 10, 'rescue': 10, 'wildfire': 40, 'water rescue': 100,
    'dam failure': 100,
# New High Impact Suggestions
'deceased': 25, 'victim': 25, 'tragedy': 25, # Synonyms for fatality
'avalanche': 20, 'mudflow': 80, 'debris flow': 80, 'rockslide': 80, # Earth movement
'levee break': 80, 'dam break': 120, 'levee breach': 100, # Water infrastructure failure
'state of emergency': 20, 'disaster': 20, 'shelter': 15, # Official declarations & response
'hazmat': 95, 'hazardous material': 55, # Secondary emergencies
'hypothermia': 15, 'heat stroke': 55, 'heat injury': 55, # Direct human health impacts
'structure collapse': 100, 'building collapse': 100, 'roof collapse': 35, # Damage
# Medium Impact (Significant Disruption & Damage)
'thunderstorm': 40, 't-storm': 8, 'lightning': 80, 'hail': 80, 'power outage': 8,
'outage': 8, 'downed trees': 50, 'tree down': 50, 'road closure': 7, 'closed': 7,
'stuck': 7, 'stranded': 7, 'high water': 7, 'wind': 7, 'gust': 7, 'blizzard': 7,
# New Medium Impact Suggestions
'microburst': 50, 'downburst': 50, 'wind damage': 50, # Damaging winds
'power lines down': 80, 'utility pole': 50, 'transformer fire': 8, # Power infrastructure
'washout': 8, 'washed out': 8, 'submerged': 8, # Flooding impacts
'accident': 7, 'wreck': 7, 'rollover': 7, # Transportation
'roadblock': 7, 'detour': 7, 'traffic': 7, # Transportation flow
'damage': 7, 'damaged': 7, # General damage term
'funnel cloud': 99, 'waterspout': 99, # Tornadic precursors
'storm surge': 9, 'coastal flood': 9, 'high surf': 8, 'rip current': 8, # Coastal impacts
'dust storm': 8, 'haboob': 8, # Dust/visibility (regional)
# Situational Awareness (Large outdoor events sensitive to weather)
'festival': 5, 'marathon': 5, 'state fair': 5, 'parade': 5,
# New Situational Awareness Suggestions
'concert': 5, 'game': 5, 'stadium': 5, 'county fair': 5, # Events
'regatta': 5, 'triathlon': 5, 'outdoor event': 5, 'fireworks': 5, # Events
# Low Impact / Hazardous Conditions (Note: some scores are high due to potential)
'rain': 5, 'snow': 15, 'sleet': 45, 'ice': 35, 'frost': 10,
'storm': 5, 'slippery': 35, 'slide': 5, 'flow': 2,
# New Low Impact / Hazardous Conditions Suggestions
'freezing rain': 55, 'black ice': 55, 'icy roads': 55, # High-impact ice
'fog': 25, 'low visibility': 15, 'whiteout': 55, 'blowing snow': 12, # Visibility hazards
'heat wave': 32, 'excessive heat': 32, 'heat index': 10, 'wind chill': 10, # Temp hazards
'heavy rain': 35, 'downpour': 35, 'heavy snow': 35, # Intense precip
'slick': 35, 'treacherous': 55, # Road conditions
'advisory': 6, 'warning': 8, 'watch': 7, # NWS products
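    # Local routes and rivers (region-specific relevance boosters)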
'i-64': 10, 'i-77': 10, 'i-79': 10, 'route 60': 10, 'corridor g': 10,
'turnpike': 10, 'kanawha river': 15, 'ohio river': 15, 'elk river': 15,
'mud river': 15, 'guyandotte': 15, 'gauley': 15, 'coal river': 15,
}
GEOGRAPHIC_KEYWORDS = {
# WV Counties
'barbour', 'boone', 'braxton', 'cabell', 'calhoun', 'clay', 'doddridge', 'gilmer', 'harrison',
'jackson', 'kanawha', 'lewis', 'lincoln', 'logan', 'mason', 'mingo', 'pleasants', 'putnam',
'ritchie', 'roane', 'taylor', 'tyler', 'upshur', 'wayne', 'webster', 'wirt', 'wood', 'wyoming',
# KY Counties
'boyd', 'carter', 'greenup', 'lawrence',
# OH Counties
'athens', 'gallia', 'jackson', 'lawrence', 'meigs', 'morgan', 'perry', 'vinton',
'washington',
# VA Counties
'buchanan', 'dickenson',
# Major Cities (Original)
'charleston', 'huntington', 'parkersburg', 'clarksburg', 'ashland', 'marietta', 'portsmouth',
# --- NEW ADDITIONS ---
# West Virginia Cities & Towns
'philippi', 'madison', 'sutton', 'gassaway', 'barboursville', 'milton', 'grantsville', 'beckley',
'west union', 'glenville', 'bridgeport', 'shinnston', 'salem', 'ripley', 'ravenswood',
'south charleston', 'st. albans', 'dunbar', 'nitro', 'teays valley', 'hurricane',
'weston', 'hamlin', 'logan', 'chapmanville', 'point pleasant', 'williamson', 'st. marys',
'winfield', 'harrisville', 'pennsboro', 'spencer', 'grafton', 'middlebourne', 'sistersville',
'buckhannon', 'wayne', 'kenova', 'ceredo', 'webster springs', 'elizabeth', 'vienna',
'williamstown', 'pineville', 'mullens',
# Ohio Cities & Towns
'belpre', 'nelsonville', 'gallipolis', 'wellston', 'oak hill', 'ironton', 'south point',
'pomeroy', 'middleport', 'mcconnelsville', 'new lexington', 'somerset', 'mcarthur', 'vienna',
# Kentucky Cities & Towns
'catlettsburg', 'grayson', 'olive hill', 'russell', 'louisa',
# Virginia Towns
'grundy', 'clintwood'
}
# --- Penalty keywords: out-of-area locations and low-impact event terms ---
PENALTY_KEYWORDS = {
# Out-of-area locations
'pittsburgh': -50, 'columbus': -25, 'cincinnati': -50, 'lexington': -25, 'texas': -50,
'tennessee': -100, 'pennsylvania': -50,
'morgantown': -40, 'bluefield': -40, 'wheeling': -50, # In WV, but outside RLX CWA
'roanoke': -50, 'blacksburg': -50, # In VA, but outside RLX CWA
'cleveland': -75, 'dayton': -50, 'akron': -60, # Other OH cities
'louisville': -50, # Other KY cities
'knoxville': -75, 'bristol': -50, # TN cities
    'north carolina': -100, 'maryland': -100, 'gatlinburg': -50,
# Low-impact events
'homecoming': -90, 'back to school': -40, 'dance': -20, 'book fair': -10, 'classes': -10, 'taylor swift': -100, 'trump': -50,
}
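# calculate_impact_score: additive point total used to rank stories.
#   1. +50 flat bonus if any GEOGRAPHIC_KEYWORDS term appears in the text,
#   2. + IMPACT_KEYWORDS values (negation-aware for single words, +50% of the
#      value if the keyword is also in the headline, each keyword counted once),
#   3. compound bonuses when wind+outage or rain+slide terms co-occur,
#   4. PENALTY_KEYWORDS subtracted (doubled for a few out-of-area cities when
#      no local geography was found); the result is clamped at zero.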
def calculate_impact_score(headline, summary):
headline_str = (headline or "").lower()
summary_str = (summary or "").lower()
    # Combine the lowercased headline and summary into one searchable string
text = f"{headline_str} {summary_str}"
if not text.strip(): return 0
score = 0
geo_relevance_found = False
# 0. Negation Set
    # If one of these appears within the two tokens before a keyword, that occurrence is ignored
negators = {'no', 'not', 'unlikely', 'false', 'zero', 'minimal', 'avoided'}
    # 1. Geographic Bonus: flat +50 when any in-area county or city appears in the text
if any(area in text for area in GEOGRAPHIC_KEYWORDS):
score += 50
geo_relevance_found = True
# 2. Impact Keyword Scores with Negation Check
tokens = word_tokenize(text) # Use NLTK tokens for positional accuracy
for keyword, value in IMPACT_KEYWORDS.items():
# Handle multi-word keywords (e.g., "dam break") separately
if " " in keyword:
if keyword in text:
score += value
# Handle single-word keywords with negation check
elif keyword in tokens:
# Find index of keyword
indices = [i for i, x in enumerate(tokens) if x == keyword]
for i in indices:
# Check previous 2 words for negators
prev_words = set(tokens[max(0, i-2):i])
if not prev_words.intersection(negators):
score += value
# Headline Bonus
if keyword in headline_str:
score += (value * 0.5)
break # Only count the keyword once per article to prevent inflation
    # 3. Compound Impact Bonus (New Feature)
    # Weather impacts rarely happen in isolation: if related categories appear
    # together in the same article, boost the score.
    found_text = " ".join(k for k in IMPACT_KEYWORDS if k in text)
    if 'wind' in found_text and 'outage' in found_text:
        score += 30  # Confidence boost: wind causing outages
    if 'rain' in found_text and 'slide' in found_text:
        score += 50  # Confidence boost: rain causing mudslides
# 4. Penalty Logic (Refined)
for keyword, penalty in PENALTY_KEYWORDS.items():
if keyword in text:
if not geo_relevance_found and keyword in ['pittsburgh', 'columbus', 'tennessee']:
score += (penalty * 2.0) # Double penalty for out-of-area specifics
else:
score += penalty
return max(0, int(score))
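# standardize_time: normalize the sources' ISO-8601-style timestamps to a UTC
# "YYYY-MM-DD HH:MM" string; returns None when the value cannot be parsed.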
def standardize_time(time_string):
if not time_string: return None
time_string_cleaned = re.sub(r'(\.\d+)', '', str(time_string).strip())
try:
dt = datetime.fromisoformat(time_string_cleaned.replace('Z', '+00:00'))
return dt.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M")
except (ValueError, TypeError):
return None
# --- News Source Scraping Functions (Unchanged) ---
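# Each scraper appends rows of the form
#   [headline, summary, storylink, date_string, image_url, source_url]
# to the module-level `stories` list; rows with missing links, headlines, or
# unparseable dates are dropped later in process_and_insert_stories().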
def get_json(url):
try:
response = S.get(url, timeout=10)
response.raise_for_status()
return response.json()
except (requests.RequestException, json.JSONDecodeError) as e:
print(f"Error fetching or parsing JSON from {url}: {e}")
return None
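# trendingstories: parses the "/api/rest/audience/trending-stories" JSON feed
# (wchstv.com and wcyb.com in newsarray below).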
def trendingstories(url, extras):
data = get_json(url + extras)
if not data: return
for item in data.get('teasers', []):
story_path = item.get('url')
if not story_path: continue
stories.append([
item.get('title'), item.get('summary'), url + story_path,
item.get('updatedDateISO8601'),
url + item.get('heroImageUrl') if item.get('heroImageUrl') else None, url
])
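# lakana / lakanatitle: parse the WordPress "lakana/v1/template-variables" JSON
# endpoint (wboy.com, wjhl.com, wowktv.com, wvnstv.com in newsarray below);
# lakanatitle reuses the article title as the summary field.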
def lakana(url, extras):
data = get_json(url + extras)
if not data: return
# Helper to handle the API returning either a List [] or a Dict {}
def extract_articles(section_data):
if isinstance(section_data, dict):
# Normal case: it's a dict containing an "articles" key
return section_data.get('articles', [])
elif isinstance(section_data, list):
# Edge case: it's already a list of articles (or empty list)
return section_data
return []
# Safely extract from both sections
# one of these is likely a list, which was breaking the old code
articles = extract_articles(data.get('additional_top_stories')) + \
extract_articles(data.get('top_stories'))
for item in articles:
# Ensure we are looking at a dictionary item
if not isinstance(item, dict): continue
# Safely extract nested fields
date_val = item.get('date')
date_str = date_val.get('datetime') if isinstance(date_val, dict) else str(date_val)
thumb_val = item.get('thumbnail')
thumb_src = thumb_val.get('src') if isinstance(thumb_val, dict) else None
stories.append([
item.get('home_page_title') or item.get('title'),
item.get('title'),
item.get('link'),
date_str,
thumb_src,
url
])
def lakanatitle(url, extras):
    data = get_json(url + extras)
    if not data: return
    # Same list-or-dict edge case as lakana(): each section may be a dict holding
    # an "articles" list or already a bare list of articles.
    def extract_articles(section):
        if isinstance(section, dict): return section.get('articles', [])
        return section if isinstance(section, list) else []
    articles = extract_articles(data.get('additional_top_stories')) + \
               extract_articles(data.get('top_stories'))
    for item in articles:
        if not isinstance(item, dict): continue
        date_val = item.get('date')
        thumb_val = item.get('thumbnail')
        stories.append([
            item.get('title'), item.get('title'), item.get('link'),
            date_val.get('datetime') if isinstance(date_val, dict) else str(date_val),
            thumb_val.get('src') if isinstance(thumb_val, dict) else None, url
        ])
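# fusion: scrapes pages that embed Fusion.globalContent inside a
# <script id="fusion-metadata"> tag (wsaz.com, wtap.com, wdtv.com, wymt.com,
# wvva.com in newsarray below).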
def fusion(url, extras):
try:
html_content = S.get(url, timeout=10).text
except requests.RequestException as e:
print(f"Error fetching HTML from {url}: {e}")
return
soup = BeautifulSoup(html_content, 'html.parser')
script_tag = soup.find('script', id='fusion-metadata')
if script_tag and script_tag.string:
match = re.search(r'Fusion\.globalContent\s*=\s*(\{.*?\});', script_tag.string, re.DOTALL)
if match:
try:
data = json.loads(match.group(1))
for item in data.get('content_elements', []):
try:
website_url_path = item.get('website_url')
if not website_url_path: continue
stories.append([
item.get('headlines', {}).get('basic'),
item.get('description', {}).get('basic'),
url + website_url_path,
item.get('display_date'),
item.get('promo_items', {}).get('basic', {}).get('url'), url
])
                    except (KeyError, TypeError, AttributeError):  # AttributeError covers null nested fields
continue
except json.JSONDecodeError:
print(f"Failed to parse Fusion JSON from {url}")
# --- End of Scraping Functions ---
stories = []
newsarray = [
["https://wsaz.com", "", "fusion"], ["https://wtap.com", "", "fusion"],
["https://wdtv.com", "", "fusion"], ["https://wymt.com", "", "fusion"],
["https://wboy.com", "/wp-json/lakana/v1/template-variables/", "lakana"],
["https://wowktv.com", "/wp-json/lakana/v1/template-variables/", "lakanatitle"],
["https://wchstv.com", "/api/rest/audience/trending-stories?count=25", "trendingstories"],
["https://wvva.com", "", "fusion"],
["https://wjhl.com", "/wp-json/lakana/v1/template-variables/", "lakana"],
["https://wcyb.com", "/api/rest/audience/trending-stories?count=25", "trendingstories"],
["https://wvnstv.com", "/wp-json/lakana/v1/template-variables/", "lakanatitle"],
]
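# runmedia: dispatch each [base_url, path_suffix, scraper_name] entry in
# newsarray to the matching scraper function, looked up by name via globals().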
def runmedia():
for source_info in newsarray:
url, extra, func_name = source_info
try:
print(f"Fetching from: {url}")
func = globals()[func_name]
func(url, extra)
except Exception as e:
print(f"Failed to process {url}. Error: {e}")
def process_and_insert_stories():
print(f"\nCollected {len(stories)} stories. Processing and inserting...")
inserted_count = 0
for story_data in stories:
try:
if len(story_data) != 6: continue
headline, summary, link, dt_str, img_url, source = story_data
timeutc = standardize_time(dt_str)
if not timeutc or not link or not headline: continue
impact_score = calculate_impact_score(headline, summary)
nlpscore = calculate_relevance(headline, summary)
print(f" - Impact[{impact_score}] NLP[{nlpscore}]: {headline}")
sql = """
INSERT INTO news (headline, summary, storylink, updated, source, imageurl, timeutc, impact_score, nlpscore)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (storylink) DO UPDATE SET
(headline, summary, updated, imageurl, timeutc, impact_score, nlpscore) =
(EXCLUDED.headline, EXCLUDED.summary, EXCLUDED.updated, EXCLUDED.imageurl,
EXCLUDED.timeutc, EXCLUDED.impact_score, EXCLUDED.nlpscore);
"""
params = (headline, summary, link, timeutc, source, img_url, timeutc, impact_score, nlpscore)
            cursor.execute(sql, params)
            conn.commit()  # Commit per row so one bad row cannot sink the whole batch
            inserted_count += 1
        except Exception as e:
            print(f"!!! DATABASE ERROR processing story row: {story_data}. Error: {e}")
            # A failed statement leaves the psycopg2 transaction aborted, which would
            # make every later execute fail too; roll back so the loop can continue.
            conn.rollback()
            continue
conn.commit()
print(f"\nDatabase insertion/update complete. {inserted_count} stories were processed and inserted/updated.")
def cleandb():
print("Cleaning duplicate summaries from the database...")
sql = """
DELETE FROM news a USING news b
WHERE a.id < b.id AND a.summary = b.summary;
"""
cursor.execute(sql)
conn.commit()
print(f"Cleaned {cursor.rowcount} duplicate entries.")
if __name__ == "__main__":
runmedia()
process_and_insert_stories()
cleandb()
cursor.close()
conn.close()
print("Process finished and connection closed.")