Spaces:

Divyonko
/

LivePulse

Running

App Files Files Community

DivYonko committed 9 days ago

Commit

146e596

1 Parent(s): eede559

feat: replace pytchat with YouTube Data API v3 scraper

Browse files

Files changed (2) hide show

app.py +113 -39
requirements.txt +1 -2

app.py CHANGED Viewed

@@ -102,53 +102,127 @@ def _safe_topic(text: str):
         return "General", 0.50
-def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Event) -> None:
-    """Background thread that scrapes live chat and writes to in-memory store."""
-    import pytchat
-    logger.info("Scraper thread starting — video=%s key=%s", video_id, redis_key)
     try:
-        chat = pytchat.create(video_id=video_id, interruptable=False)
     except Exception as exc:
-        logger.error("pytchat.create failed: %s", exc)
         return
-    if not chat.is_alive():
-        logger.error("Live chat not available for %s", video_id)
         return
-    logger.info("Live chat connected for %s", video_id)
-    while chat.is_alive() and not stop_event.is_set():
-        try:
-            for c in chat.get().sync_items():
-                if stop_event.is_set():
-                    break
-                text   = c.message.strip()
-                author = c.author.name
-                if not text:
-                    continue
-                sentiment, s_conf = _safe_sentiment(text)
-                topic,     t_conf = _safe_topic(text)
-                message_data = {
-                    "author":     author,
-                    "text":       text,
-                    "sentiment":  sentiment,
-                    "confidence": round(s_conf, 3),
-                    "topic":      topic,
-                    "topic_conf": round(t_conf, 3),
-                    "time":       datetime.now().isoformat(),
-                }
-                store_rpush(redis_key, json.dumps(message_data))
-        except Exception as exc:
-            if not stop_event.is_set():
-                logger.error("Scraper error: %s", exc, exc_info=True)
-        if not stop_event.is_set():
-            time.sleep(1)
     logger.info("Scraper thread ended — key=%s", redis_key)

         return "General", 0.50
+def _get_live_chat_id(video_id: str, api_key: str) -> str | None:
+    """Fetch the liveChatId for a given video using YouTube Data API v3.
+
+    Args:
+        video_id: ID of the YouTube video to look up.
+        api_key: YouTube Data API key, sent as the ``key`` query parameter.
+
+    Returns:
+        The ``activeLiveChatId`` found under the first item's
+        ``liveStreamingDetails``, or ``None`` when no video is returned, the
+        video has no active live chat, or the request/parsing fails. Every
+        ``None`` case is logged as an error.
+    """
+    import urllib.request
+    import urllib.parse
+    # Build the query with urlencode so that *every* value is escaped,
+    # including the API key (previously only video_id was quoted). This also
+    # matches how _fetch_chat_messages builds its URL.
+    query = urllib.parse.urlencode({
+        "part": "liveStreamingDetails",
+        "id": video_id,
+        "key": api_key,
+    })
+    url = "https://www.googleapis.com/youtube/v3/videos?" + query
+    try:
+        with urllib.request.urlopen(url, timeout=10) as resp:
+            data = json.loads(resp.read())
+        items = data.get("items", [])
+        if not items:
+            logger.error("No video found for id=%s", video_id)
+            return None
+        live_details = items[0].get("liveStreamingDetails", {})
+        chat_id = live_details.get("activeLiveChatId")
+        if not chat_id:
+            logger.error("No active live chat for video id=%s", video_id)
+        return chat_id
+    except Exception as exc:
+        # Best-effort boundary: network, HTTP and JSON errors all end up here
+        # and are reported to the caller as "no chat ID".
+        logger.error("Failed to get liveChatId: %s", exc)
+        return None
+def _fetch_chat_messages(live_chat_id: str, api_key: str, page_token: str | None = None):
+    """
+    Request a single page of messages for a live chat.
+
+    ``page_token`` is only sent when it is truthy. On success the tuple
+    (items, nextPageToken, pollingIntervalMillis) is returned, with an empty
+    list and 5000 ms as defaults for missing fields. On any failure the error
+    is logged and ``([], None, 5000)`` is returned.
+    """
+    import urllib.request
+    import urllib.parse
+    endpoint = "https://www.googleapis.com/youtube/v3/liveChat/messages?"
+    query = {
+        "part": "snippet,authorDetails",
+        "liveChatId": live_chat_id,
+        "key": api_key,
+        "maxResults": "200",
+    }
+    if page_token:
+        query["pageToken"] = page_token
+    request_url = endpoint + urllib.parse.urlencode(query)
     try:
+        with urllib.request.urlopen(request_url, timeout=10) as response:
+            payload = json.loads(response.read())
+        # Field lookups stay inside the try so an unexpected payload shape
+        # is handled like any other fetch failure.
+        return (
+            payload.get("items", []),
+            payload.get("nextPageToken"),
+            payload.get("pollingIntervalMillis", 5000),
+        )
     except Exception as exc:
+        logger.error("Failed to fetch chat messages: %s", exc)
+        return [], None, 5000
+def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Event) -> None:
+    """Background thread — scrapes live chat via YouTube Data API v3.
+
+    Resolves the video's live chat ID, then polls it until ``stop_event`` is
+    set. Every non-empty ``textMessageEvent`` is run through
+    ``_safe_sentiment`` and ``_safe_topic`` and pushed to ``redis_key`` as a
+    JSON string via ``store_rpush``.
+
+    Args:
+        video_id: ID of the YouTube video whose live chat is scraped.
+        redis_key: Store key that the JSON-encoded messages are appended to.
+        stop_event: Set by the caller to make the polling loop exit.
+
+    The function returns early (after logging an error) when the
+    ``YOUTUBE_API_KEY`` environment variable is empty or no live chat ID can
+    be obtained.
+    """
+    api_key = os.getenv("YOUTUBE_API_KEY", "")
+    if not api_key:
+        logger.error("YOUTUBE_API_KEY env var not set. Cannot start scraper.")
         return
+    logger.info("Scraper thread starting — video=%s key=%s", video_id, redis_key)
+    # Step 1: get the live chat ID
+    live_chat_id = _get_live_chat_id(video_id, api_key)
+    if not live_chat_id:
+        logger.error("Could not get live chat ID for video=%s", video_id)
         return
+    logger.info("Live chat ID obtained: %s", live_chat_id)
+    # Step 2: poll for messages
+    page_token = None
+    # Processed message IDs, kept in a dict because dict keys preserve
+    # insertion order: trimming can then drop the *oldest* IDs. A plain set
+    # has no order, so slicing it would discard arbitrary (possibly recent)
+    # IDs and let duplicates through when a page is served again.
+    seen_ids: dict[str, None] = {}
+    while not stop_event.is_set():
+        # A failed fetch returns a None token, so the next poll starts again
+        # from the first page; seen_ids filters what was already stored.
+        messages, page_token, poll_ms = _fetch_chat_messages(live_chat_id, api_key, page_token)
+        for item in messages:
+            if stop_event.is_set():
+                break
+            msg_id = item.get("id", "")
+            # Only de-duplicate items that actually carry an ID; otherwise
+            # every ID-less item would collapse onto the same "" key.
+            if msg_id:
+                if msg_id in seen_ids:
+                    continue
+                seen_ids[msg_id] = None
+            snippet = item.get("snippet", {})
+            # only process text messages
+            if snippet.get("type") != "textMessageEvent":
+                continue
+            text = snippet.get("displayMessage", "").strip()
+            author = item.get("authorDetails", {}).get("displayName", "Unknown")
+            if not text:
+                continue
+            sentiment, s_conf = _safe_sentiment(text)
+            topic, t_conf = _safe_topic(text)
+            message_data = {
+                "author":     author,
+                "text":       text,
+                "sentiment":  sentiment,
+                "confidence": round(s_conf, 3),
+                "topic":      topic,
+                "topic_conf": round(t_conf, 3),
+                "time":       datetime.now().isoformat(),
+            }
+            store_rpush(redis_key, json.dumps(message_data))
+        # keep seen_ids from growing unbounded: retain the 2000 newest IDs
+        if len(seen_ids) > 5000:
+            seen_ids = dict.fromkeys(list(seen_ids)[-2000:])
+        # respect YouTube's requested polling interval (min 3s to be safe);
+        # waiting on the event lets a stop request end the pause immediately
+        wait_s = max(poll_ms / 1000, 3.0)
+        stop_event.wait(timeout=wait_s)
     logger.info("Scraper thread ended — key=%s", redis_key)

requirements.txt CHANGED Viewed

@@ -7,8 +7,7 @@ sentencepiece>=0.1.99
 emoji>=2.10.0
 deep-translator>=1.11.4
-# Live chat scraping
-pytchat>=0.5.5
 # Dashboard
 streamlit>=1.35.0

 emoji>=2.10.0
 deep-translator>=1.11.4
+# Live chat scraping (now uses YouTube Data API v3 — no extra package needed)
 # Dashboard
 streamlit>=1.35.0