// Timeline scraper script: walks paginated HTML pages, extracts tweet records
// from each page with string/regex matching, stores them through
// saveTweets(pool, tweets), and finally prints aggregate statistics read from
// the x_tweets table before closing the pool and exiting.
//
// Pieces visible in this file:
//   - NITTER_URL / USERNAME / DELAY_MS: module constants; USERNAME is used to
//     build the https://x.com/<USERNAME>/status/<id> URL of every tweet.
//     (presumably pages are fetched from NITTER_URL - the code that builds the
//     page URL is not visible here; confirm.)
//   - dbConfig: connection settings taken from DB_HOST / DB_USER / DB_PASSWORD /
//     DB_NAME environment variables, each with a hard-coded fallback.
//   - fetchPage(url): returns a Promise resolving to the response body as one
//     concatenated string. Uses the https module when the URL starts with
//     "https", otherwise http. Rejects on a request "error" event. It does not
//     look at the HTTP status code and sets no timeout.
//   - parseDateTime(timeStr): turns a string such as
//     "Jan 7, 2026 · 12:00 PM UTC" into "YYYY-MM-DD HH:MM:SS" (taken from
//     toISOString(), i.e. UTC). Returns null for a falsy input, an unparseable
//     date, or any thrown error.
//   - extractTweets(html): splits the page on 'class="timeline-item ' and, for
//     each fragment, fills { isPinned, isRetweet, id, time, text, images,
//     hasVideo, url }. Fragments without a status id are dropped. Image paths
//     from href="/pic/..." are URI-decoded and prefixed with
//     "https://pbs.twimg.com/".
//   - extractNextCursor(html): only the start of this function is present.
//   - Scraping loop (tail visible): stops after 3 consecutive pages without
//     tweets, when extractNextCursor returns a falsy value, or when the shared
//     consecutiveEmpty counter reaches 5 inside the catch branch (errors and
//     empty pages increment the same counter). Sleeps DELAY_MS between pages
//     and DELAY_MS * 3 after an error.
//   - main() is invoked at the bottom; a rejection is logged and the process
//     exits with code 1, a normal run exits with code 0.
//
// NOTE(review): this copy of the file looks damaged and should be restored from
// version control before any code change is attempted:
//   1. The whole program sits on three physical lines, so every `//` line
//      comment swallows the code that follows it on the same line.
//   2. Several regex literals appear to have lost their HTML-tag portions
//      (e.g. `/]*>]*title="([^"]+)"/`, `.replace(//g, "\n")`,
//      `.replace(/]*>([^<]*)<\/a>/g, "$1")`) - presumably stripped by an HTML
//      sanitizer; the intended patterns cannot be recovered from this file.
//   3. The text jumps from the middle of extractNextCursor's regex directly
//      into a log message inside the scraping loop. saveTweets, pool, cursor,
//      pageNum, totalSaved and consecutiveEmpty are used but their definitions
//      are not present here.
const https = require("https"); const http = require("http"); const mysql = require("mysql2/promise"); // Settings const NITTER_URL = "http://nitter:8080"; const USERNAME = "realfromis_9"; const DELAY_MS = 1000; // delay between pages // DB connection const dbConfig = { host: process.env.DB_HOST || "mariadb", user: process.env.DB_USER || "fromis9_user", password: process.env.DB_PASSWORD || "fromis9_password", database: process.env.DB_NAME || "fromis9", }; async function fetchPage(url) { return new Promise((resolve, reject) => { const client = url.startsWith("https") ? https : http; client .get(url, (res) => { let data = ""; res.on("data", (chunk) => (data += chunk)); res.on("end", () => resolve(data)); }) .on("error", reject); }); } function parseDateTime(timeStr) { // "Jan 7, 2026 · 12:00 PM UTC" -> MySQL DATETIME if (!timeStr) return null; try { const cleaned = timeStr.replace(" · ", " ").replace(" UTC", ""); const date = new Date(cleaned + " UTC"); if (isNaN(date.getTime())) return null; return date.toISOString().slice(0, 19).replace("T", " "); } catch (e) { return null; } } function extractTweets(html) { const tweets = []; const tweetContainers = html.split('class="timeline-item '); for (let i = 1; i < tweetContainers.length; i++) { const container = tweetContainers[i]; const tweet = {}; // check for pinned tweet tweet.isPinned = tweetContainers[i - 1].includes("pinned") || container.includes("Pinned"); // check for retweet tweet.isRetweet = container.includes('class="retweet-header"'); // extract tweet ID const linkMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/); tweet.id = linkMatch ? linkMatch[1] : null; // extract time const timeMatch = container.match( /]*>]*title="([^"]+)"/ ); tweet.time = timeMatch ? parseDateTime(timeMatch[1]) : null; // extract text content const contentMatch = container.match( /
]*>([\s\S]*?)<\/div>/ ); if (contentMatch) { tweet.text = contentMatch[1] .replace(//g, "\n") .replace(/]*>([^<]*)<\/a>/g, "$1") .replace(/<[^>]+>/g, "") .trim(); } // extract image URLs const imageMatches = container.match(/href="\/pic\/([^"]+)"/g); tweet.images = []; if (imageMatches) { imageMatches.forEach((match) => { const urlMatch = match.match(/href="\/pic\/([^"]+)"/); if (urlMatch) { const decoded = decodeURIComponent(urlMatch[1]); // convert to full URL tweet.images.push("https://pbs.twimg.com/" + decoded); } }); } // check for video tweet.hasVideo = container.includes("gallery-video") || container.includes("video-container"); // build URL tweet.url = tweet.id ? `https://x.com/${USERNAME}/status/${tweet.id}` : null; if (tweet.id) { tweets.push(tweet); } } return tweets; } function extractNextCursor(html) { // extract cursor from the show-more link const cursorMatch = html.match( /class="show-more"[^>]*>\s* 트윗 없음 (연속 ${consecutiveEmpty}회)`); if (consecutiveEmpty >= 3) { console.log("\n연속 3페이지 트윗 없음. 스크래핑 완료."); break; } } else { consecutiveEmpty = 0; const saved = await saveTweets(pool, tweets); totalSaved += saved; console.log( ` -> ${tweets.length}개 추출, ${saved}개 저장 (누적: ${totalSaved})` ); } // extract next-page cursor const nextCursor = extractNextCursor(html); if (!nextCursor) { console.log("\n다음 페이지 없음. 스크래핑 완료."); break; } cursor = nextCursor; pageNum++; // delay await new Promise((r) => setTimeout(r, DELAY_MS)); } catch (error) { console.error(` -> 오류: ${error.message}`); consecutiveEmpty++; if (consecutiveEmpty >= 5) { console.log("\n연속 오류. 
스크래핑 중단."); break; } await new Promise((r) => setTimeout(r, DELAY_MS * 3)); } } console.log("\n" + "=".repeat(60)); console.log("스크래핑 완료"); console.log(`총 저장: ${totalSaved}개`); console.log("=".repeat(60)); // print statistics const [stats] = await pool.query(` SELECT COUNT(*) as total, SUM(is_retweet) as retweets, SUM(NOT is_retweet) as original, SUM(has_video) as with_video, MIN(created_at) as oldest, MAX(created_at) as newest FROM x_tweets `); console.log("\n[통계]"); console.log(stats[0]); await pool.end(); process.exit(0); } main().catch((err) => { console.error("치명적 오류:", err); process.exit(1); });