fix(x-bot): 리트윗 내용 잘림, Nitter 링크, 이미지 누락 수정

- extractTextFromHtml: Nitter 프록시 t.co URL을 원본 https://t.co/ URL로 변환
- parseTweets: 리트윗 원본 작성자(originalUsername) 추출, URL을 원본 작성자 기준으로 생성
- saveTweet: 리트윗인 경우 원본 작성자를 username으로 저장
- refetch-retweets 엔드포인트 및 스크립트 추가 (기존 잘못된 데이터 재수집)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-29 13:59:19 +09:00 · 2026-03-29 13:59:19 +09:00 · 3ce8d7ec7d
commit 3ce8d7ec7d
parent c37d7e14af
4 changed files with 244 additions and 3 deletions
--- a/backend/scripts/refetch-retweets.js
+++ b/backend/scripts/refetch-retweets.js
@ -0,0 +1,95 @@
+/**
+ * 리트윗 데이터 재수집 스크립트
+ * 잘못 저장된 리트윗 일정을 Nitter에서 다시 가져와 수정합니다.
+ *
+ * 사용법: node scripts/refetch-retweets.js [scheduleId1,scheduleId2,...]
+ */
+import mysql from 'mysql2/promise';
+import { fetchSingleTweet, extractTitle } from '../src/services/x/scraper.js';
+
+// Base URL of the Nitter instance to scrape; overridable through NITTER_URL.
+const NITTER_URL = process.env.NITTER_URL || 'http://nitter:8080';
+
+// Connection pool configured from the DB_* environment variables. Each setting
+// falls back to the literal default below when the variable is unset or empty.
+const pool = mysql.createPool({
+  host: process.env.DB_HOST || 'mariadb',
+  // Explicit radix so the port is always parsed as a decimal number.
+  port: Number.parseInt(process.env.DB_PORT || '3306', 10),
+  user: process.env.DB_USER || 'fromis9',
+  password: process.env.DB_PASSWORD || 'fromis9',
+  database: process.env.DB_NAME || 'fromis9',
+});
+
+/**
+ * Re-fetches stored tweet rows from Nitter and rewrites their title, username,
+ * content and image URLs.
+ *
+ * Rows are selected either by the comma-separated schedule IDs given as the
+ * first CLI argument or, when no argument is given, by content that starts
+ * with "RT @" or contains a Nitter-proxied t.co link. A failure on one row is
+ * logged and counted; the remaining rows are still processed.
+ *
+ * @returns {Promise<void>} Resolves once every row was attempted and the pool is closed.
+ * @throws {Error} If an ID argument is given but contains no positive integer.
+ */
+async function main() {
+  try {
+    // Specific schedule IDs may be passed on the CLI.
+    const rawIds = process.argv[2];
+    const argIds = rawIds
+      ?.split(',')
+      .map(Number)
+      .filter((id) => Number.isInteger(id) && id > 0);
+
+    // An argument that yields no usable ID must not silently fall through to
+    // the "rewrite every matching row" branch below.
+    if (rawIds !== undefined && argIds.length === 0) {
+      throw new Error(`유효한 일정 ID가 없습니다: ${rawIds}`);
+    }
+
+    let rows;
+    if (argIds) {
+      [rows] = await pool.query(
+        `SELECT sx.schedule_id, sx.post_id, sx.username, sx.content
+         FROM schedule_x sx WHERE sx.schedule_id IN (?)`,
+        [argIds]
+      );
+    } else {
+      [rows] = await pool.query(
+        `SELECT sx.schedule_id, sx.post_id, sx.username, sx.content
+         FROM schedule_x sx
+         WHERE sx.content LIKE 'RT @%' OR sx.content LIKE '%nitter%t.co%'`
+      );
+    }
+
+    console.log(`대상: ${rows.length}건`);
+    if (rows.length === 0) {
+      return;
+    }
+
+    let updated = 0;
+    let failed = 0;
+
+    for (const [index, row] of rows.entries()) {
+      // Throttle between requests to keep load off Nitter. Waiting here rather
+      // than at the end of the try block keeps the pause in place after a failure too.
+      if (index > 0) {
+        await new Promise(r => setTimeout(r, 500));
+      }
+
+      try {
+        // Take the original author from a leading "RT @username:" when present.
+        const rtMatch = row.content?.match(/^RT @(\w+):/);
+        const fetchUsername = rtMatch ? rtMatch[1] : (row.username || 'realfromis_9');
+
+        console.log(`[${row.schedule_id}] post_id=${row.post_id}, from=@${fetchUsername}`);
+
+        const tweet = await fetchSingleTweet(NITTER_URL, fetchUsername, row.post_id);
+
+        // Strip a leading "RT @username:" prefix from the fetched text.
+        let newContent = tweet.text;
+        const rtPrefixMatch = newContent.match(/^RT @\w+:\s*/);
+        if (rtPrefixMatch) {
+          newContent = newContent.slice(rtPrefixMatch[0].length);
+        }
+        // Drop a trailing ellipsis character.
+        newContent = newContent.replace(/…$/, '').trim();
+
+        const newTitle = extractTitle(newContent);
+        const newImageUrls = tweet.imageUrls.length > 0 ? JSON.stringify(tweet.imageUrls) : null;
+
+        // Write both tables in one transaction so a failure between the two
+        // statements cannot leave the title and the tweet content out of step.
+        const connection = await pool.getConnection();
+        try {
+          await connection.beginTransaction();
+          await connection.query('UPDATE schedules SET title = ? WHERE id = ?', [newTitle, row.schedule_id]);
+          await connection.query(
+            'UPDATE schedule_x SET username = ?, content = ?, image_urls = ? WHERE schedule_id = ?',
+            [fetchUsername, newContent, newImageUrls, row.schedule_id]
+          );
+          await connection.commit();
+        } catch (dbErr) {
+          await connection.rollback();
+          throw dbErr;
+        } finally {
+          connection.release();
+        }
+
+        console.log(`  -> title: ${newTitle.substring(0, 60)} | images: ${tweet.imageUrls.length}`);
+        updated++;
+      } catch (err) {
+        console.error(`  -> 실패: ${err.message}`);
+        failed++;
+      }
+    }
+
+    console.log(`\n완료: ${updated}건 수정, ${failed}건 실패`);
+  } finally {
+    // Close the pool on every exit path, including a failed SELECT.
+    await pool.end();
+  }
+}
+
+// Script entry point: run main() and, if it rejects, print the error and exit with status 1.
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
--- a/backend/src/routes/admin/x.js
+++ b/backend/src/routes/admin/x.js
@ -1,5 +1,5 @@
 import { fetchSingleTweet, extractTitle } from '../../services/x/scraper.js';
-import { addOrUpdateSchedule } from '../../services/meilisearch/index.js';
+import { addOrUpdateSchedule, syncScheduleById } from '../../services/meilisearch/index.js';
 import { formatDate, formatTime } from '../../utils/date.js';
 import config, { CATEGORY_IDS } from '../../config/index.js';
 import {
@ -161,4 +161,122 @@ export default async function xRoutes(fastify) {
      return serverError(reply, err.message);
    }
  });
+
+  /**
+   * POST /api/admin/x/refetch-retweets
+   * Re-fetches stored retweet rows from Nitter and rewrites their title,
+   * username, content and image_urls, then re-syncs each one to Meilisearch.
+   *
+   * Body: { scheduleIds?: number[] }. When omitted or empty, every row whose
+   * content starts with "RT @" or contains a Nitter-proxied t.co link is targeted.
+   * Responds with { success, total, updated, errors }; a failure on one row is
+   * recorded in `errors` and does not stop the remaining rows.
+   */
+  fastify.post('/refetch-retweets', {
+    schema: {
+      tags: ['admin/x'],
+      summary: '리트윗 데이터 재수집',
+      description: '잘못 저장된 리트윗 일정을 Nitter에서 다시 가져와 수정합니다.',
+      security: [{ bearerAuth: [] }],
+      body: {
+        type: 'object',
+        properties: {
+          scheduleIds: {
+            type: 'array',
+            items: { type: 'integer' },
+            description: '재수집할 일정 ID 목록 (비어있으면 전체 리트윗 대상)',
+          },
+        },
+      },
+    },
+    preHandler: [fastify.authenticate],
+  }, async (request, reply) => {
+    try {
+      let rows;
+      const { scheduleIds } = request.body || {};
+
+      if (scheduleIds && scheduleIds.length > 0) {
+        // Only the requested schedules.
+        [rows] = await db.query(
+          `SELECT sx.schedule_id, sx.post_id, sx.username, sx.content
+           FROM schedule_x sx
+           WHERE sx.schedule_id IN (?)`,
+          [scheduleIds]
+        );
+      } else {
+        // Rows whose content starts with "RT @" or still contains a
+        // Nitter-proxied t.co link. (A former third clause that also required
+        // image_urls IS NULL was a strict subset of the first and is dropped.)
+        [rows] = await db.query(
+          `SELECT sx.schedule_id, sx.post_id, sx.username, sx.content
+           FROM schedule_x sx
+           WHERE sx.content LIKE 'RT @%'
+              OR sx.content LIKE '%nitter%t.co%'`
+        );
+      }
+
+      if (rows.length === 0) {
+        return { success: true, message: '재수집 대상이 없습니다.', updated: 0 };
+      }
+
+      let updated = 0;
+      const errors = [];
+
+      for (const [index, row] of rows.entries()) {
+        // Throttle between requests to keep load off Nitter. Waiting here rather
+        // than at the end of the try block keeps the pause in place after a failure too.
+        if (index > 0) {
+          await new Promise(r => setTimeout(r, 500));
+        }
+
+        try {
+          // Take the original author from a leading "RT @username:" when present.
+          let fetchUsername = row.username || DEFAULT_USERNAME;
+          const rtMatch = row.content?.match(/^RT @(\w+):/);
+          if (rtMatch) {
+            fetchUsername = rtMatch[1];
+          }
+
+          // Fetch from the original author's single-tweet page.
+          const tweet = await fetchSingleTweet(NITTER_URL, fetchUsername, row.post_id);
+
+          // Strip a leading "RT @username:" prefix if the fetched text still has one.
+          let newContent = tweet.text;
+          const rtPrefixMatch = newContent.match(/^RT @\w+:\s*/);
+          if (rtPrefixMatch) {
+            newContent = newContent.slice(rtPrefixMatch[0].length);
+          }
+          // Drop a trailing ellipsis character.
+          newContent = newContent.replace(/…$/, '').trim();
+
+          const newTitle = extractTitle(newContent);
+          const newImageUrls = tweet.imageUrls.length > 0 ? JSON.stringify(tweet.imageUrls) : null;
+
+          // NOTE(review): the two UPDATEs below are not wrapped in a transaction,
+          // so a failure between them leaves the title and content out of step.
+          // Update the schedules table.
+          await db.query(
+            'UPDATE schedules SET title = ? WHERE id = ?',
+            [newTitle, row.schedule_id]
+          );
+
+          // Update the schedule_x table (username is corrected to the original author).
+          await db.query(
+            'UPDATE schedule_x SET username = ?, content = ?, image_urls = ? WHERE schedule_id = ?',
+            [fetchUsername, newContent, newImageUrls, row.schedule_id]
+          );
+
+          // Sync the updated schedule to Meilisearch.
+          await syncScheduleById(meilisearch, db, row.schedule_id);
+
+          updated++;
+          fastify.log.info(`리트윗 재수집 완료: schedule_id=${row.schedule_id}, post_id=${row.post_id}`);
+        } catch (err) {
+          errors.push({ scheduleId: row.schedule_id, postId: row.post_id, error: err.message });
+          fastify.log.error(`리트윗 재수집 실패 (${row.schedule_id}): ${err.message}`);
+        }
+      }
+
+      logActivity(db, {
+        actor: 'admin',
+        action: 'update',
+        category: 'schedule',
+        targetType: 'x_schedule',
+        summary: `리트윗 재수집: ${updated}/${rows.length}건 완료`,
+      });
+
+      return { success: true, total: rows.length, updated, errors };
+    } catch (err) {
+      fastify.log.error(`리트윗 재수집 오류: ${err.message}`);
+      return serverError(reply, err.message);
+    }
+  });
 }
--- a/backend/src/services/x/index.js
+++ b/backend/src/services/x/index.js
@ -65,6 +65,9 @@ async function xBotPlugin(fastify, opts) {
    const time = formatTime(tweet.time);
    const title = extractTitle(tweet.text);

+    // 리트윗인 경우 원본 작성자를 username으로 사용
+    const tweetUsername = tweet.originalUsername || username;
+
    // 트랜잭션으로 INSERT 작업 수행
    return withTransaction(fastify.db, async (connection) => {
      // schedules 테이블에 저장
@ -80,7 +83,7 @@ async function xBotPlugin(fastify, opts) {
        [
          scheduleId,
          tweet.id,
-          username,
+          tweetUsername,
          tweet.text,
          tweet.imageUrls.length > 0 ? JSON.stringify(tweet.imageUrls) : null,
        ]
--- a/backend/src/services/x/scraper.js
+++ b/backend/src/services/x/scraper.js
@ -112,6 +112,11 @@ function extractTextFromHtml(html) {
    .replace(/<br\s*\/?>/g, '\n')
    // <a> 태그: href에서 원본 URL 추출 (외부 링크만)
    .replace(/<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>/g, (match, href, text) => {
+      // t.co 링크: Nitter가 프록시한 URL을 원본 t.co URL로 변환
+      const tcoMatch = href.match(/\/t\.co\/([^\s"?]+)/);
+      if (tcoMatch) {
+        return `https://t.co/${tcoMatch[1]}`;
+      }
      // Nitter 내부 링크 (/search, /hashtag 등)는 표시 텍스트 사용
      if (href.startsWith('/')) {
        return text;
@ -146,6 +151,22 @@ export function parseTweets(html, username, options = {}) {
    const isRetweet = container.includes('class="retweet-header"');
    if (isRetweet && !includeRetweets) continue;

+    // 리트윗인 경우 원본 작성자 추출 (data-username 또는 tweet-header에서)
+    let originalUsername = null;
+    if (isRetweet) {
+      const dataUserMatch = containers[i - 1]?.match(/data-username="([^"]+)"/) ||
+                            container.match(/data-username="([^"]+)"/);
+      if (dataUserMatch) {
+        originalUsername = dataUserMatch[1];
+      } else {
+        // tweet-header의 username 링크에서 추출
+        const headerUserMatch = container.match(/class="username"[^>]*href="\/([^"]+)"/);
+        if (headerUserMatch) {
+          originalUsername = headerUserMatch[1];
+        }
+      }
+    }
+
    // 트윗 ID
    const idMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);
    if (!idMatch) continue;
@ -171,7 +192,11 @@ export function parseTweets(html, username, options = {}) {
      time,
      text,
      imageUrls,
-      url: `https://x.com/${username}/status/${id}`,
+      isRetweet,
+      originalUsername,
+      url: isRetweet && originalUsername
+        ? `https://x.com/${originalUsername}/status/${id}`
+        : `https://x.com/${username}/status/${id}`,
    });
  }