const https = require("https");
const http = require("http");
const mysql = require("mysql2/promise");

// Settings
const NITTER_URL = "http://nitter:8080";
const USERNAME = "realfromis_9";
const DELAY_MS = 1500;
// Upper bound for a single page request so a stalled connection cannot hang the run.
const REQUEST_TIMEOUT_MS = 30000;
// Abort after this many consecutive empty pages / failed requests.
const MAX_CONSECUTIVE_EMPTY = 3;
const MAX_CONSECUTIVE_ERRORS = 5;

// Search window (X account migration date ~ starting point of the existing scrape)
const SEARCH_SINCE = "2025-04-24";
const SEARCH_UNTIL = "2025-06-16";

// DB connection
const dbConfig = {
  host: process.env.DB_HOST || "mariadb",
  user: process.env.DB_USER || "fromis9_user",
  password: process.env.DB_PASSWORD || "fromis9_password",
  database: process.env.DB_NAME || "fromis9",
};

/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Fetch a URL with a GET request and return the response body as text.
 *
 * Rejects on network errors, on timeout, and on any non-2xx status so that
 * an error page (e.g. rate limiting) is retried by the caller instead of
 * being parsed as "a page without tweets".
 *
 * @param {string} url - Absolute http:// or https:// URL.
 * @returns {Promise<string>} Response body decoded as UTF-8.
 */
function fetchPage(url) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith("https") ? https : http;
    const req = client.get(url, (res) => {
      const { statusCode } = res;
      if (statusCode < 200 || statusCode >= 300) {
        res.resume(); // drain the socket so it can be released
        reject(new Error(`HTTP ${statusCode} for ${url}`));
        return;
      }
      // Decode in the stream: concatenating raw Buffer chunks can split a
      // multi-byte UTF-8 character at a chunk boundary and corrupt it.
      res.setEncoding("utf8");
      let data = "";
      res.on("data", (chunk) => {
        data += chunk;
      });
      res.on("end", () => resolve(data));
      res.on("error", reject);
    });
    req.on("error", reject);
    req.setTimeout(REQUEST_TIMEOUT_MS, () => {
      req.destroy(
        new Error(`Request timed out after ${REQUEST_TIMEOUT_MS} ms: ${url}`)
      );
    });
  });
}

/**
 * Convert a timestamp title such as "Apr 24, 2025 · 3:00 AM UTC" into
 * "YYYY-MM-DD HH:MM:SS" (UTC).
 *
 * @param {string} timeStr - Raw title attribute text.
 * @returns {string|null} Formatted timestamp, or null when it cannot be parsed.
 */
function parseDateTime(timeStr) {
  if (!timeStr || typeof timeStr !== "string") return null;
  const cleaned = timeStr.replace(" · ", " ").replace(" UTC", "");
  const date = new Date(`${cleaned} UTC`);
  if (Number.isNaN(date.getTime())) return null;
  return date.toISOString().slice(0, 19).replace("T", " ");
}

/**
 * Extract tweets from a search-result HTML page.
 *
 * NOTE(review): the opening "<tag" part of several regex literals was lost
 * from the source this file was recovered from. The tag/class names used in
 * the date and content patterns below are reconstructed — confirm them
 * against the actual page markup.
 *
 * @param {string} html - Full HTML of one result page.
 * @returns {Array<object>} Tweets that have a status id; each has
 *   isPinned, isRetweet, id, time, text, images, hasVideo and url.
 */
function extractSearchTweets(html) {
  const tweets = [];
  const tweetContainers = html.split('class="timeline-item ');

  // Index 0 is the markup before the first timeline item.
  for (const container of tweetContainers.slice(1)) {
    const linkMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);
    if (!linkMatch) continue; // not a tweet (e.g. a "show more" row)
    const id = linkMatch[1];

    const timeMatch = container.match(
      /<span class="tweet-date"[^>]*><a[^>]*title="([^"]+)"/
    );

    const contentMatch = container.match(
      /<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/
    );
    let text = null;
    if (contentMatch) {
      text = contentMatch[1]
        .replace(/<br\s*\/?>/g, "\n")
        .replace(/<a[^>]*>([^<]*)<\/a>/g, "$1") // keep link text, drop the tag
        .replace(/<[^>]+>/g, "")
        .trim();
    }

    const images = [];
    for (const match of container.matchAll(/href="\/pic\/([^"]+)"/g)) {
      images.push("https://pbs.twimg.com/" + decodeURIComponent(match[1]));
    }

    tweets.push({
      isPinned: false,
      isRetweet: container.includes('class="retweet-header"'),
      id,
      time: timeMatch ? parseDateTime(timeMatch[1]) : null,
      text,
      images,
      hasVideo:
        container.includes("gallery-video") ||
        container.includes("video-container"),
      url: `https://x.com/${USERNAME}/status/${id}`,
    });
  }

  return tweets;
}

/**
 * Find the pagination cursor of the "show more" link.
 *
 * NOTE(review): the tail of this function was missing from the recovered
 * source; the href parsing below is reconstructed — confirm.
 *
 * @param {string} html - Full HTML of one result page.
 * @returns {string|null} Cursor value exactly as it appears in the href
 *   (not decoded, so it can be appended to a URL as-is), or null.
 */
function extractNextCursor(html) {
  const hrefMatch = html.match(/class="show-more"[^>]*>\s*<a href="([^"]+)"/);
  if (!hrefMatch) return null;
  // The separator before "cursor" may be "?", "&" or the ";" of "&amp;".
  const cursorMatch = hrefMatch[1].match(/[?&;]cursor=([^&"]+)/);
  return cursorMatch ? cursorMatch[1] : null;
}

/**
 * Insert tweets into x_tweets, skipping rows that already exist.
 *
 * NOTE(review): the SQL statement of this function was missing from the
 * recovered source. Only the table name and the is_retweet / created_at
 * columns are confirmed (by the statistics query in main); the rest of the
 * column list is an assumption and MUST be checked against the real schema
 * before running.
 *
 * @param {object} pool - mysql2 promise pool.
 * @param {Array<object>} tweets - Output of extractSearchTweets.
 * @returns {Promise<number>} Number of rows actually inserted.
 */
async function saveTweets(pool, tweets) {
  let saved = 0;
  for (const tweet of tweets) {
    try {
      const [result] = await pool.query(
        `INSERT IGNORE INTO x_tweets
           (id, text, created_at, is_retweet, is_pinned, images, has_video, url)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
        [
          tweet.id,
          tweet.text ?? null,
          tweet.time,
          tweet.isRetweet,
          tweet.isPinned,
          JSON.stringify(tweet.images),
          tweet.hasVideo,
          tweet.url,
        ]
      );
      if (result.affectedRows > 0) saved++;
    } catch (e) {
      // Best effort: one bad row must not stop the rest of the page.
      console.error(`저장 오류 (ID: ${tweet.id}):`, e.message);
    }
  }
  return saved;
}

/**
 * Page through the search results for USERNAME within the configured
 * window, store every tweet found, then print table statistics.
 *
 * Stops when there is no next cursor, after MAX_CONSECUTIVE_EMPTY pages
 * without tweets, or after MAX_CONSECUTIVE_ERRORS failed attempts in a row.
 * A failed page is retried with the same cursor after a longer delay.
 *
 * @returns {Promise<void>} Never resolves normally: exits the process with 0.
 */
async function main() {
  console.log("=".repeat(60));
  console.log("X 트윗 검색 스크래핑 (누락 기간)");
  console.log("=".repeat(60));
  console.log(`대상: @${USERNAME}`);
  console.log(`기간: ${SEARCH_SINCE} ~ ${SEARCH_UNTIL}`);
  console.log("");

  const pool = mysql.createPool(dbConfig);

  try {
    const searchQuery = encodeURIComponent(
      `from:${USERNAME} since:${SEARCH_SINCE} until:${SEARCH_UNTIL}`
    );

    let cursor = null;
    let pageNum = 1;
    let totalSaved = 0;
    // Tracked separately so fetch errors never count towards the
    // "no more tweets" exit and empty pages never count as errors.
    let consecutiveEmpty = 0;
    let consecutiveErrors = 0;

    while (true) {
      // The cursor is used exactly as found in the page href (already encoded).
      const url = cursor
        ? `${NITTER_URL}/search?f=tweets&q=${searchQuery}&cursor=${cursor}`
        : `${NITTER_URL}/search?f=tweets&q=${searchQuery}`;

      console.log(`[페이지 ${pageNum}] 검색 중...`);

      try {
        const html = await fetchPage(url);
        const tweets = extractSearchTweets(html);

        if (tweets.length === 0) {
          consecutiveEmpty++;
          console.log(` -> 트윗 없음 (연속 ${consecutiveEmpty}회)`);
          if (consecutiveEmpty >= MAX_CONSECUTIVE_EMPTY) {
            console.log("\n연속 3페이지 트윗 없음. 스크래핑 완료.");
            break;
          }
        } else {
          consecutiveEmpty = 0;
          const saved = await saveTweets(pool, tweets);
          totalSaved += saved;
          console.log(
            ` -> ${tweets.length}개 추출, ${saved}개 저장 (누적: ${totalSaved})`
          );
        }

        const nextCursor = extractNextCursor(html);
        if (!nextCursor) {
          console.log("\n다음 페이지 없음. 스크래핑 완료.");
          break;
        }

        consecutiveErrors = 0;
        cursor = nextCursor;
        pageNum++;
        await sleep(DELAY_MS);
      } catch (error) {
        console.error(` -> 오류: ${error.message}`);
        consecutiveErrors++;
        if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
          console.log("\n연속 오류. 스크래핑 중단.");
          break;
        }
        // Back off longer than the normal page delay before retrying.
        await sleep(DELAY_MS * 3);
      }
    }

    console.log("\n" + "=".repeat(60));
    console.log("검색 스크래핑 완료");
    console.log(`추가 저장: ${totalSaved}개`);
    console.log("=".repeat(60));

    const [stats] = await pool.query(`
      SELECT COUNT(*) as total,
             SUM(is_retweet) as retweets,
             SUM(NOT is_retweet) as original,
             MIN(created_at) as oldest,
             MAX(created_at) as newest
      FROM x_tweets
    `);
    console.log("\n[전체 통계]");
    console.log(stats[0]);
  } finally {
    // Release connections even when the loop or the stats query throws.
    await pool.end();
  }

  process.exit(0);
}

main().catch((err) => {
  console.error("치명적 오류:", err);
  process.exit(1);
});