From 3ce8d7ec7d80b039eaab02920da7ea8dc3a13978 Mon Sep 17 00:00:00 2001
From: caadiq
Date: Sun, 29 Mar 2026 13:59:19 +0900
Subject: [PATCH] =?UTF-8?q?fix(x-bot):=20=EB=A6=AC=ED=8A=B8=EC=9C=97=20?=
 =?UTF-8?q?=EB=82=B4=EC=9A=A9=20=EC=9E=98=EB=A6=BC,=20Nitter=20=EB=A7=81?=
 =?UTF-8?q?=ED=81=AC,=20=EC=9D=B4=EB=AF=B8=EC=A7=80=20=EB=88=84=EB=9D=BD?=
 =?UTF-8?q?=20=EC=88=98=EC=A0=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- extractTextFromHtml: Nitter 프록시 t.co URL을 원본 https://t.co/ URL로 변환
- parseTweets: 리트윗 원본 작성자(originalUsername) 추출, URL을 원본 작성자 기준으로 생성
- saveTweet: 리트윗인 경우 원본 작성자를 username으로 저장
- refetch-retweets 엔드포인트 및 스크립트 추가 (기존 잘못된 데이터 재수집)

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 backend/scripts/refetch-retweets.js |  95 ++++++++++++++++++++++
 backend/src/routes/admin/x.js       | 120 +++++++++++++++++++++++++++-
 backend/src/services/x/index.js     |   5 +-
 backend/src/services/x/scraper.js   |  27 ++++++-
 4 files changed, 244 insertions(+), 3 deletions(-)
 create mode 100644 backend/scripts/refetch-retweets.js

diff --git a/backend/scripts/refetch-retweets.js b/backend/scripts/refetch-retweets.js
new file mode 100644
index 0000000..7ecc190
--- /dev/null
+++ b/backend/scripts/refetch-retweets.js
@@ -0,0 +1,95 @@
+/**
+ * Retweet re-collection script.
+ * Re-fetches incorrectly stored retweet schedules from Nitter and repairs them.
+ *
+ * Usage: node scripts/refetch-retweets.js [scheduleId1,scheduleId2,...]
+ */
+import mysql from 'mysql2/promise';
+import { fetchSingleTweet, extractTitle } from '../src/services/x/scraper.js';
+
+const NITTER_URL = process.env.NITTER_URL || 'http://nitter:8080';
+
+const pool = mysql.createPool({
+  host: process.env.DB_HOST || 'mariadb',
+  port: Number.parseInt(process.env.DB_PORT || '3306', 10),
+  user: process.env.DB_USER || 'fromis9',
+  password: process.env.DB_PASSWORD || 'fromis9',
+  database: process.env.DB_NAME || 'fromis9',
+});
+
+async function main() {
+  // Optional CLI argument: comma-separated schedule IDs (non-numeric entries are dropped)
+  const argIds = process.argv[2]?.split(',').map(Number).filter(Boolean);
+
+  let rows;
+  if (argIds && argIds.length > 0) {
+    [rows] = await pool.query(
+      `SELECT sx.schedule_id, sx.post_id, sx.username, sx.content
+       FROM schedule_x sx WHERE sx.schedule_id IN (?)`,
+      [argIds]
+    );
+  } else {
+    [rows] = await pool.query(
+      `SELECT sx.schedule_id, sx.post_id, sx.username, sx.content
+       FROM schedule_x sx
+       WHERE sx.content LIKE 'RT @%' OR sx.content LIKE '%nitter%t.co%'`
+    );
+  }
+
+  console.log(`대상: ${rows.length}건`);
+  if (rows.length === 0) {
+    await pool.end();
+    return;
+  }
+
+  let updated = 0;
+  let failed = 0;
+
+  for (const row of rows) {
+    try {
+      // Extract the original author from a leading "RT @username:" marker
+      const rtMatch = row.content?.match(/^RT @(\w+):/);
+      const fetchUsername = rtMatch ? rtMatch[1] : (row.username || 'realfromis_9');
+
+      console.log(`[${row.schedule_id}] post_id=${row.post_id}, from=@${fetchUsername}`);
+
+      const tweet = await fetchSingleTweet(NITTER_URL, fetchUsername, row.post_id);
+
+      // Strip the "RT @username:" prefix from the fetched text
+      let newContent = tweet.text;
+      const rtPrefixMatch = newContent.match(/^RT @\w+:\s*/);
+      if (rtPrefixMatch) {
+        newContent = newContent.slice(rtPrefixMatch[0].length);
+      }
+      // Drop a trailing ellipsis (…), even when whitespace follows it
+      newContent = newContent.replace(/…\s*$/, '').trim();
+
+      const newTitle = extractTitle(newContent);
+      const newImageUrls = tweet.imageUrls.length > 0 ? JSON.stringify(tweet.imageUrls) : null;
+
+      // Update the database (schedule title, then the schedule_x row)
+      await pool.query('UPDATE schedules SET title = ? WHERE id = ?', [newTitle, row.schedule_id]);
+      await pool.query(
+        'UPDATE schedule_x SET username = ?, content = ?, image_urls = ? WHERE schedule_id = ?',
+        [fetchUsername, newContent, newImageUrls, row.schedule_id]
+      );
+
+      console.log(` -> title: ${newTitle.substring(0, 60)} | images: ${tweet.imageUrls.length}`);
+      updated++;
+    } catch (err) {
+      console.error(` -> 실패: ${err.message}`);
+      failed++;
+    } finally {
+      // Throttle Nitter requests after failures as well as successes
+      await new Promise(r => setTimeout(r, 500));
+    }
+  }
+
+  console.log(`\n완료: ${updated}건 수정, ${failed}건 실패`);
+  await pool.end();
+}
+
+main().catch(err => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/backend/src/routes/admin/x.js b/backend/src/routes/admin/x.js
index 6ec622c..0a8de88 100644
--- a/backend/src/routes/admin/x.js
+++ b/backend/src/routes/admin/x.js
@@ -1,5 +1,5 @@
 import { fetchSingleTweet, extractTitle } from '../../services/x/scraper.js';
-import { addOrUpdateSchedule } from '../../services/meilisearch/index.js';
+import { addOrUpdateSchedule, syncScheduleById } from '../../services/meilisearch/index.js';
 import { formatDate, formatTime } from '../../utils/date.js';
 import config, { CATEGORY_IDS } from '../../config/index.js';
 import {
@@ -161,4 +161,122 @@ export default async function xRoutes(fastify) {
       return serverError(reply, err.message);
     }
   });
+
+  /**
+   * POST /api/admin/x/refetch-retweets
+   * Re-collects retweet data (repairs incorrect content/image_urls).
+   */
+  fastify.post('/refetch-retweets', {
+    schema: {
+      tags: ['admin/x'],
+      summary: '리트윗 데이터 재수집',
+      description: '잘못 저장된 리트윗 일정을 Nitter에서 다시 가져와 수정합니다.',
+      security: [{ bearerAuth: [] }],
+      body: {
+        type: 'object',
+        properties: {
+          scheduleIds: {
+            type: 'array',
+            items: { type: 'integer' },
+            description: '재수집할 일정 ID 목록 (비어있으면 전체 리트윗 대상)',
+          },
+        },
+      },
+    },
+    preHandler: [fastify.authenticate],
+  }, async (request, reply) => {
+    try {
+      let rows;
+      const { scheduleIds } = request.body || {};
+
+      if (scheduleIds && scheduleIds.length > 0) {
+        // Only the requested schedules
+        [rows] = await db.query(
+          `SELECT sx.schedule_id, sx.post_id, sx.username, sx.content
+           FROM schedule_x sx
+           WHERE sx.schedule_id IN (?)`,
+          [scheduleIds]
+        );
+      } else {
+        // Rows whose content starts with "RT @" or still holds a Nitter-proxied t.co link
+        // (same selection as scripts/refetch-retweets.js)
+        [rows] = await db.query(
+          `SELECT sx.schedule_id, sx.post_id, sx.username, sx.content
+           FROM schedule_x sx
+           WHERE sx.content LIKE 'RT @%'
+              OR sx.content LIKE '%nitter%t.co%'`
+        );
+      }
+
+      if (rows.length === 0) {
+        return { success: true, message: '재수집 대상이 없습니다.', updated: 0 };
+      }
+
+      let updated = 0;
+      const errors = [];
+
+      for (const row of rows) {
+        try {
+          // Extract the original author from the content ("RT @username:" form)
+          let fetchUsername = row.username || DEFAULT_USERNAME;
+          const rtMatch = row.content?.match(/^RT @(\w+):/);
+          if (rtMatch) {
+            fetchUsername = rtMatch[1];
+          }
+
+          // Fetch from the original author's individual tweet page
+          const tweet = await fetchSingleTweet(NITTER_URL, fetchUsername, row.post_id);
+
+          // Strip the "RT @username:" prefix if fetchSingleTweet still returns it
+          let newContent = tweet.text;
+          const rtPrefixMatch = newContent.match(/^RT @\w+:\s*/);
+          if (rtPrefixMatch) {
+            newContent = newContent.slice(rtPrefixMatch[0].length);
+          }
+          // Drop a trailing ellipsis (…), even when whitespace follows it
+          newContent = newContent.replace(/…\s*$/, '').trim();
+
+          const newTitle = extractTitle(newContent);
+          const newImageUrls = tweet.imageUrls.length > 0 ? JSON.stringify(tweet.imageUrls) : null;
+
+          // Update the schedules table
+          await db.query(
+            'UPDATE schedules SET title = ? WHERE id = ?',
+            [newTitle, row.schedule_id]
+          );
+
+          // Update the schedule_x table (username is corrected to the original author too)
+          await db.query(
+            'UPDATE schedule_x SET username = ?, content = ?, image_urls = ? WHERE schedule_id = ?',
+            [fetchUsername, newContent, newImageUrls, row.schedule_id]
+          );
+
+          // Sync the updated schedule to Meilisearch
+          await syncScheduleById(meilisearch, db, row.schedule_id);
+
+          updated++;
+          fastify.log.info(`리트윗 재수집 완료: schedule_id=${row.schedule_id}, post_id=${row.post_id}`);
+        } catch (err) {
+          errors.push({ scheduleId: row.schedule_id, postId: row.post_id, error: err.message });
+          fastify.log.error(`리트윗 재수집 실패 (${row.schedule_id}): ${err.message}`);
+        } finally {
+          // Throttle Nitter requests after failures as well as successes
+          await new Promise(r => setTimeout(r, 500));
+        }
+      }
+
+      logActivity(db, {
+        actor: 'admin',
+        action: 'update',
+        category: 'schedule',
+        targetType: 'x_schedule',
+        summary: `리트윗 재수집: ${updated}/${rows.length}건 완료`,
+      });
+
+      return { success: true, total: rows.length, updated, errors };
+    } catch (err) {
+      fastify.log.error(`리트윗 재수집 오류: ${err.message}`);
+      return serverError(reply, err.message);
+    }
+  });
 }
diff --git a/backend/src/services/x/index.js b/backend/src/services/x/index.js
index 8a73fab..6d6e95d 100644
--- a/backend/src/services/x/index.js
+++ b/backend/src/services/x/index.js
@@ -65,6 +65,9 @@ async function xBotPlugin(fastify, opts) {
     const time = formatTime(tweet.time);
     const title = extractTitle(tweet.text);
 
+    // For retweets, store the original author as the username
+    const tweetUsername = tweet.originalUsername || username;
+
     // 트랜잭션으로 INSERT 작업 수행
     return withTransaction(fastify.db, async (connection) => {
       // schedules 테이블에 저장
@@ -80,7 +83,7 @@ async function xBotPlugin(fastify, opts) {
         [
           scheduleId,
           tweet.id,
-          username,
+          tweetUsername,
           tweet.text,
           tweet.imageUrls.length > 0 ? JSON.stringify(tweet.imageUrls) : null,
         ]
diff --git a/backend/src/services/x/scraper.js b/backend/src/services/x/scraper.js
index e37c410..0f69439 100644
--- a/backend/src/services/x/scraper.js
+++ b/backend/src/services/x/scraper.js
@@ -112,6 +112,11 @@ function extractTextFromHtml(html) {
     .replace(//g, '\n')
     // 태그: href에서 원본 URL 추출 (외부 링크만)
     .replace(/]*href="([^"]*)"[^>]*>([^<]*)<\/a>/g, (match, href, text) => {
+      // t.co links: turn the Nitter-proxied URL back into the original t.co URL
+      const tcoMatch = href.match(/\/t\.co\/([^\s"?]+)/);
+      if (tcoMatch) {
+        return `https://t.co/${tcoMatch[1]}`;
+      }
       // Nitter 내부 링크 (/search, /hashtag 등)는 표시 텍스트 사용
       if (href.startsWith('/')) {
         return text;
@@ -146,6 +151,22 @@ export function parseTweets(html, username, options = {}) {
     const isRetweet = container.includes('class="retweet-header"');
     if (isRetweet && !includeRetweets) continue;
 
+    // For retweets, extract the original author (data-username first, then the tweet-header)
+    let originalUsername = null;
+    if (isRetweet) {
+      const dataUserMatch = containers[i - 1]?.match(/data-username="([^"]+)"/) ||
+        container.match(/data-username="([^"]+)"/);
+      if (dataUserMatch) {
+        originalUsername = dataUserMatch[1];
+      } else {
+        // Otherwise take it from the href of the class="username" link
+        const headerUserMatch = container.match(/class="username"[^>]*href="\/([^"]+)"/);
+        if (headerUserMatch) {
+          originalUsername = headerUserMatch[1];
+        }
+      }
+    }
+
     // 트윗 ID
     const idMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);
     if (!idMatch) continue;
@@ -171,7 +192,11 @@ export function parseTweets(html, username, options = {}) {
       time,
       text,
       imageUrls,
-      url: `https://x.com/${username}/status/${id}`,
+      isRetweet,
+      originalUsername,
+      url: isRetweet && originalUsername
+        ? `https://x.com/${originalUsername}/status/${id}`
+        : `https://x.com/${username}/status/${id}`,
     });
   }
 