From d1250124a7de8a8893c817c018faa850abf7479a Mon Sep 17 00:00:00 2001
From: caadiq <caadiq@gmail.com>
Date: Wed, 21 Jan 2026 11:46:38 +0900
Subject: [PATCH] =?UTF-8?q?X=20=EB=B4=87=20URL=20=EC=B6=94=EC=B6=9C=20?=
 =?UTF-8?q?=EA=B0=9C=EC=84=A0=20=EB=B0=8F=20=EA=B8=B0=EC=A1=B4=20=EB=8D=B0?=
 =?UTF-8?q?=EC=9D=B4=ED=84=B0=20=EC=88=98=EC=A0=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 트윗 파싱 시 축약된 URL 대신 href의 원본 URL 사용
- extractTextFromHtml 함수 추가
- 기존 트윗 content 업데이트 스크립트 추가

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 backend/scripts/update-x-content.js | 121 ++++++++++++++++++++++++++++
 backend/src/services/x/scraper.js   |  31 ++++---
 2 files changed, 142 insertions(+), 10 deletions(-)
 create mode 100644 backend/scripts/update-x-content.js
diff --git a/backend/scripts/update-x-content.js b/backend/scripts/update-x-content.js
new file mode 100644
index 0000000..90e44e7
--- /dev/null
+++ b/backend/scripts/update-x-content.js
@@ -0,0 +1,121 @@
+/**
+ * 기존 X 트윗의 content를 Nitter에서 다시 가져와서 원본 URL로 업데이트
+ */
+import mysql from 'mysql2/promise';
+
+const NITTER_URL = process.env.NITTER_URL || 'http://nitter:8080'; // Nitter base URL; the default is used when NITTER_URL is unset or empty
+const USERNAME = 'realfromis_9'; // account name used to build the status URL in fetchTweetContent
+
+// Database connection, opened at module load with top-level await; settings come from the DB_* environment variables
+const db = await mysql.createConnection({
+  host: process.env.DB_HOST,
+  user: process.env.DB_USER,
+  password: process.env.DB_PASSWORD,
+  database: process.env.DB_NAME,
+});
+
+/** Extract plain text from a tweet-content HTML fragment, substituting the href for the visible text of non-relative links.
+ * @param {string} html - HTML fragment to convert
+ * @returns {string} trimmed text with tags removed; HTML entities are not decoded here */
+function extractTextFromHtml(html) {
+  return html
+    .replace(/<br\s*\/?>/g, '\n') // <br> / <br/> -> newline
+    .replace(/<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>/g, (match, href, text) => { // only anchors with a double-quoted href and no nested tags match
+      if (href.startsWith('/')) { // site-relative href: keep the displayed text
+        return text;
+      }
+      return href; // any other href replaces the displayed (possibly shortened) text
+    })
+    .replace(/<[^>]+>/g, '') // strip every remaining tag, including anchors the pattern above did not match
+    .trim();
+}
+
+/** Fetch a single tweet page from Nitter and return its text as produced by extractTextFromHtml.
+ * @param {string|number} postId - tweet id interpolated into `${NITTER_URL}/${USERNAME}/status/${postId}`
+ * @returns {Promise<string>} extracted text. Throws Error on a non-OK HTTP status or when the expected markup is not found. */
+async function fetchTweetContent(postId) {
+  const url = `${NITTER_URL}/${USERNAME}/status/${postId}`;
+  const res = await fetch(url);
+
+  if (!res.ok) { // any non-2xx status is reported as "tweet not found" together with the status code
+    throw new Error(`트윗을 찾을 수 없습니다 (${res.status})`);
+  }
+
+  const html = await res.text();
+
+  const mainTweetMatch = html.match(/<div id="m" class="main-tweet">([\s\S]*?)<div id="r" class="replies">/); // lazily capture everything between the main-tweet div and the replies div; both must be present
+  if (!mainTweetMatch) {
+    throw new Error('트윗 내용을 파싱할 수 없습니다');
+  }
+
+  const container = mainTweetMatch[1];
+  const contentMatch = container.match(/<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/); // capture stops at the first </div>, so a nested <div> would truncate the content
+
+  if (!contentMatch) {
+    throw new Error('트윗 컨텐츠를 찾을 수 없습니다');
+  }
+
+  return extractTextFromHtml(contentMatch[1]);
+}
+
+// Entry point: re-fetch every schedule_x row whose content contains '…', update rows whose text changed, then print counts and close the DB connection
+async function main() {
+  console.log('X 트윗 content 업데이트 시작...\n');
+
+  // Load every tweet row from schedule_x (newest schedule_id first)
+  const [rows] = await db.query(`
+    SELECT sx.schedule_id, sx.post_id, sx.content
+    FROM schedule_x sx
+    ORDER BY sx.schedule_id DESC
+  `);
+
+  console.log(`총 ${rows.length}개의 트윗을 확인합니다.\n`);
+
+  let updated = 0; // rows written back with new content
+  let skipped = 0; // rows without '…' plus rows whose re-fetched text was identical
+  let errors = 0; // rows where fetching or updating threw
+
+  for (const row of rows) {
+    const { schedule_id, post_id, content } = row;
+
+    // Skip empty content and content without an ellipsis character; '…' anywhere in the text is treated as the marker of a shortened URL
+    if (!content || !content.includes('…')) {
+      skipped++;
+      continue;
+    }
+
+    console.log(`[${schedule_id}] post_id: ${post_id} - 업데이트 중...`);
+
+    try {
+      const newContent = await fetchTweetContent(post_id);
+
+      // Write back only when the re-fetched text differs from the stored text
+      if (newContent !== content) {
+        await db.query(
+          'UPDATE schedule_x SET content = ? WHERE schedule_id = ?',
+          [newContent, schedule_id]
+        );
+        console.log(`  ✓ 업데이트 완료`);
+        updated++;
+      } else {
+        console.log(`  - 변경 없음`);
+        skipped++;
+      }
+
+      // Rate limiting: 500 ms pause after each processed row. NOTE(review): this sits inside the try, so it is skipped when the row fails
+      await new Promise(r => setTimeout(r, 500));
+    } catch (err) { // a per-row failure is logged and counted; the loop continues with the next row
+      console.log(`  ✗ 오류: ${err.message}`);
+      errors++;
+    }
+  }
+
+  console.log(`\n완료!`);
+  console.log(`  업데이트: ${updated}개`);
+  console.log(`  스킵: ${skipped}개`);
+  console.log(`  오류: ${errors}개`);
+
+  await db.end(); // only reached when nothing above threw outside the per-row try/catch
+}
+
+main().catch(console.error); // NOTE(review): a rejection from main() is only logged here; db.end() is not called on that path
diff --git a/backend/src/services/x/scraper.js b/backend/src/services/x/scraper.js
index 4e15c39..f42752f 100644
--- a/backend/src/services/x/scraper.js
+++ b/backend/src/services/x/scraper.js
@@ -77,6 +77,25 @@ export function extractProfile(html) {
   return profile;
 }
 
+/** Extract plain text from a tweet-content HTML fragment, substituting the href for the visible text of non-relative links.
+ * @param {string} html - HTML fragment to convert
+ * @returns {string} trimmed text with tags removed; HTML entities are not decoded here */
+function extractTextFromHtml(html) {
+  return html
+    .replace(/<br\s*\/?>/g, '\n')
+    // <a> tags: take the original URL from href (external links only); anchors containing nested tags do not match this pattern
+    .replace(/<a[^>]*href="([^"]*)"[^>]*>([^<]*)<\/a>/g, (match, href, text) => {
+      // Nitter-internal links (/search, /hashtag, etc.) start with '/' and keep their displayed text
+      if (href.startsWith('/')) {
+        return text;
+      }
+      // Every other link is replaced by the URL in its href instead of the displayed text
+      return href;
+    })
+    .replace(/<[^>]+>/g, '')
+    .trim();
+}
+
 /**
  * HTML에서 트윗 목록 파싱
  */
@@ -106,11 +125,7 @@ export function parseTweets(html, username) {
     const contentMatch = container.match(/<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/);
     let text = '';
     if (contentMatch) {
-      text = contentMatch[1]
-        .replace(/<br\s*\/?>/g, '\n')
-        .replace(/<a[^>]*>([^<]*)<\/a>/g, '$1')
-        .replace(/<[^>]+>/g, '')
-        .trim();
+      text = extractTextFromHtml(contentMatch[1]);
     }
 
     // 이미지
@@ -157,11 +172,7 @@ export async function fetchSingleTweet(nitterUrl, username, postId) {
   const contentMatch = container.match(/<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/);
   let text = '';
   if (contentMatch) {
-    text = contentMatch[1]
-      .replace(/<br\s*\/?>/g, '\n')
-      .replace(/<a[^>]*>([^<]*)<\/a>/g, '$1')
-      .replace(/<[^>]+>/g, '')
-      .trim();
+    text = extractTextFromHtml(contentMatch[1]);
   }
 
   // 이미지