fromis_9/backend/scrape_all.js
caadiq e994aa08ca refactor: API 및 페이지 폴더 구조 정리 (2/3)
- api/schedules, albums, members → api/public/로 이동
- pages/pc/*.jsx → pages/pc/public/로 이동
- pages/mobile/*.jsx → pages/mobile/public/로 이동
- App.jsx 라우터 경로 수정
- 모든 public 페이지의 import 경로 수정
2026-01-09 22:00:14 +09:00

239 lines
6.3 KiB
JavaScript

const https = require("https");
const http = require("http");
const mysql = require("mysql2/promise");
// Configuration
// Base URL of the Nitter instance the timeline pages are requested from.
const NITTER_URL = "http://nitter:8080";
// Account whose timeline is scraped; also used when building tweet URLs and DB rows.
const USERNAME = "realfromis_9";
const DELAY_MS = 1000; // delay between page requests, in milliseconds
// Database connection settings; each value can be overridden through the
// corresponding environment variable, otherwise the literal default is used.
const dbConfig = {
host: process.env.DB_HOST || "mariadb",
user: process.env.DB_USER || "fromis9_user",
password: process.env.DB_PASSWORD || "fromis9_password",
database: process.env.DB_NAME || "fromis9",
};
/**
 * Fetch a URL over HTTP or HTTPS and resolve with the response body as text.
 *
 * The response stream is decoded as UTF-8 by Node itself, so a multi-byte
 * character that happens to be split across two network chunks is reassembled
 * correctly instead of being turned into replacement characters.
 *
 * @param {string} url - Absolute URL. URLs starting with "https" use the
 *   https module, everything else uses http.
 * @returns {Promise<string>} The complete response body.
 * @throws {Error} Rejects on connection errors, on an idle timeout, when the
 *   response stream fails mid-body, or when the status code is not 2xx.
 */
async function fetchPage(url) {
  // Idle timeout: without it a stalled socket would block the scraper forever.
  const REQUEST_TIMEOUT_MS = 30000;

  return new Promise((resolve, reject) => {
    const client = url.startsWith("https") ? https : http;

    const request = client.get(url, (res) => {
      const { statusCode } = res;
      if (statusCode < 200 || statusCode >= 300) {
        // Drain the body so the socket is released, then report the failure
        // rather than handing an error page to the HTML parser.
        res.resume();
        reject(new Error(`HTTP ${statusCode} for ${url}`));
        return;
      }

      // Let the stream decoder handle characters that straddle chunk borders.
      res.setEncoding("utf8");

      let data = "";
      res.on("data", (chunk) => {
        data += chunk;
      });
      res.on("end", () => resolve(data));
      // A connection dropped mid-body never emits "end"; reject instead of hanging.
      res.on("error", reject);
    });

    request.on("error", reject);
    request.setTimeout(REQUEST_TIMEOUT_MS, () => {
      request.destroy(
        new Error(`Request timed out after ${REQUEST_TIMEOUT_MS} ms: ${url}`)
      );
    });
  });
}
/**
 * Convert a timestamp such as "Jan 7, 2026 · 12:00 PM UTC" into a
 * "YYYY-MM-DD HH:MM:SS" string (UTC) suitable for a MySQL DATETIME column.
 *
 * @param {string} timeStr - Timestamp text taken from the page.
 * @returns {string|null} The formatted timestamp, or null when the input is
 *   empty, cannot be parsed, or is not a string.
 */
function parseDateTime(timeStr) {
  if (!timeStr) {
    return null;
  }

  try {
    // Drop the " · " separator and the zone suffix, then re-append the zone
    // so the value is always interpreted as UTC.
    const withoutSeparator = timeStr.replace(" · ", " ");
    const withoutZone = withoutSeparator.replace(" UTC", "");
    const parsed = new Date(`${withoutZone} UTC`);

    const epochMs = parsed.getTime();
    if (Number.isNaN(epochMs)) {
      return null;
    }

    // "2026-01-07T12:00:00.000Z" -> "2026-01-07 12:00:00"
    const isoSeconds = parsed.toISOString().substring(0, 19);
    return isoSeconds.replace("T", " ");
  } catch (err) {
    // Non-string input (no .replace method) ends up here.
    return null;
  }
}
/**
 * Decode the HTML entities that can appear in tweet text.
 * Handles the common named entities plus decimal/hex numeric references in a
 * single pass, so "&amp;lt;" becomes "&lt;" and is not decoded twice.
 * Unknown entities are left as they are.
 */
function decodeHtmlEntities(text) {
  const namedEntities = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", "\u00a0"],
  ]);
  return text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|\w+);/g, (entity, body) => {
    if (body[0] === "#") {
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = Number.parseInt(
        body.slice(isHex ? 2 : 1),
        isHex ? 16 : 10
      );
      // String.fromCodePoint throws outside the Unicode range.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return namedEntities.has(body) ? namedEntities.get(body) : entity;
  });
}

/**
 * Percent-decode the path captured from a "/pic/..." link.
 * Falls back to the raw path when it contains a malformed escape sequence, so
 * one bad link cannot abort parsing of the whole page.
 */
function decodePicPath(encodedPath) {
  try {
    return decodeURIComponent(encodedPath);
  } catch (err) {
    return encodedPath;
  }
}

/**
 * Extract tweets from the HTML of a timeline page.
 *
 * The page is split on the `class="timeline-item ` marker; every chunk after
 * the first is treated as one timeline item. Items without a status link
 * (for example pagination rows) are skipped.
 *
 * @param {string} html - Full HTML of one timeline page.
 * @returns {Array<{
 *   isPinned: boolean, isRetweet: boolean, id: string, time: (string|null),
 *   text: (string|null), images: string[], hasVideo: boolean, url: string
 * }>} One entry per item that has a status id, in page order.
 */
function extractTweets(html) {
  const tweets = [];
  // sections[0] is the markup that precedes the first timeline item.
  const sections = html.split('class="timeline-item ');

  for (const container of sections.slice(1)) {
    // Tweet id: items without a status link are not tweets.
    const linkMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);
    if (!linkMatch) {
      continue;
    }
    const id = linkMatch[1];

    // Pinned marker: only look inside this item. Looking at the previous
    // chunk would also flag the tweet that follows a pinned one.
    const isPinned =
      container.includes("pinned") || container.includes("Pinned");

    // Retweet marker
    const isRetweet = container.includes('class="retweet-header"');

    // Timestamp (title attribute of the date link)
    const timeMatch = container.match(
      /<span class="tweet-date"[^>]*><a[^>]*title="([^"]+)"/
    );
    const time = timeMatch ? parseDateTime(timeMatch[1]) : null;

    // Text content: turn <br> into newlines, keep link text, drop remaining
    // tags, and only then decode entities so that an escaped "&lt;b&gt;" in
    // the tweet is not mistaken for markup.
    const contentMatch = container.match(
      /<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/
    );
    const text = contentMatch
      ? decodeHtmlEntities(
          contentMatch[1]
            .replace(/<br\s*\/?>/g, "\n")
            .replace(/<a[^>]*>([^<]*)<\/a>/g, "$1")
            .replace(/<[^>]+>/g, "")
        ).trim()
      : null;

    // Image URLs: every "/pic/<encoded path>" link, rewritten to a full URL.
    const images = [];
    for (const [, encodedPath] of container.matchAll(
      /href="\/pic\/([^"]+)"/g
    )) {
      images.push("https://pbs.twimg.com/" + decodePicPath(encodedPath));
    }

    // Video marker
    const hasVideo =
      container.includes("gallery-video") ||
      container.includes("video-container");

    tweets.push({
      isPinned,
      isRetweet,
      id,
      time,
      text,
      images,
      hasVideo,
      url: `https://x.com/${USERNAME}/status/${id}`,
    });
  }

  return tweets;
}
/**
 * Extract the pagination cursor from the first link whose href has the form
 * `<path>?cursor=<value>`; the path may be empty (relative "?cursor=..." link).
 *
 * The match is confined to a single href attribute. Letting it run across
 * attribute boundaries made the result depend on whatever markup happened to
 * sit between an earlier absolute link and the cursor link, and a "?" anywhere
 * in between made the cursor undetectable.
 *
 * @param {string} html - Full HTML of one timeline page.
 * @returns {string|null} The cursor value exactly as written in the href, or
 *   null when the page has no such link.
 */
function extractNextCursor(html) {
  const cursorMatch = html.match(/href="[^"?]*\?cursor=([^"]+)"/);
  return cursorMatch ? cursorMatch[1] : null;
}
/**
 * Insert tweets into the x_tweets table, one row per tweet.
 *
 * Rows are written with INSERT IGNORE, so a tweet whose id already exists is
 * skipped by the database. Only rows that were actually inserted are counted;
 * counting every successful statement would report duplicates as "saved".
 * A failure on one tweet is logged and does not stop the remaining inserts.
 *
 * @param {object} pool - Object exposing `query(sql, params)` that resolves to
 *   `[result, fields]`, where `result.affectedRows` is the inserted row count.
 * @param {Array<object>} tweets - Tweets as produced by extractTweets.
 * @returns {Promise<number>} Number of rows newly inserted.
 */
async function saveTweets(pool, tweets) {
  const insertSql = `
INSERT IGNORE INTO x_tweets (id, username, text, created_at, is_retweet, is_pinned, images, has_video, url)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
`;

  let saved = 0;
  for (const tweet of tweets) {
    try {
      const [result] = await pool.query(insertSql, [
        tweet.id,
        USERNAME,
        // A tweet without extracted text is stored as NULL.
        tweet.text === undefined ? null : tweet.text,
        tweet.time,
        tweet.isRetweet,
        tweet.isPinned,
        JSON.stringify(tweet.images),
        tweet.hasVideo,
        tweet.url,
      ]);
      // affectedRows is 0 when INSERT IGNORE skipped an existing id.
      if (result && result.affectedRows > 0) {
        saved++;
      }
    } catch (e) {
      console.error(`저장 오류 (ID: ${tweet.id}):`, e.message);
    }
  }
  return saved;
}
/**
 * Scrape the whole timeline of USERNAME from the Nitter instance page by page,
 * store the tweets through saveTweets, print summary statistics and exit.
 *
 * The loop stops when:
 *  - a page has no next-page cursor,
 *  - 3 consecutive pages contain no tweets,
 *  - 5 consecutive attempts fail with an error, or
 *  - the page points at a cursor that was already visited (would loop forever).
 *
 * Errors and empty pages are counted separately so that a couple of transient
 * failures cannot trigger the "empty pages" stop condition. A failed page is
 * retried with the same cursor after a longer delay.
 *
 * The connection pool is always closed, even when the statistics query throws.
 *
 * @returns {Promise<void>} Does not return normally: calls process.exit(0) on
 *   completion. Rejects if the statistics query or closing the pool fails.
 */
async function main() {
  const MAX_CONSECUTIVE_EMPTY = 3;
  const MAX_CONSECUTIVE_ERRORS = 5;
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  console.log("=".repeat(60));
  console.log("X 트윗 전체 스크래핑 시작");
  console.log("=".repeat(60));
  console.log(`대상: @${USERNAME}`);
  console.log(`Nitter: ${NITTER_URL}`);
  console.log("");

  const pool = await mysql.createPool(dbConfig);

  try {
    let cursor = null;
    let pageNum = 1;
    let totalSaved = 0;
    let consecutiveEmpty = 0;
    let consecutiveErrors = 0;
    // Cursors already requested; seeing one again means pagination is cycling.
    const seenCursors = new Set();

    while (true) {
      const url = cursor
        ? `${NITTER_URL}/${USERNAME}?cursor=${cursor}`
        : `${NITTER_URL}/${USERNAME}`;
      console.log(`[페이지 ${pageNum}] 스크래핑 중...`);

      try {
        const html = await fetchPage(url);
        const tweets = extractTweets(html);

        if (tweets.length === 0) {
          consecutiveEmpty++;
          console.log(` -> 트윗 없음 (연속 ${consecutiveEmpty}회)`);
          if (consecutiveEmpty >= MAX_CONSECUTIVE_EMPTY) {
            console.log("\n연속 3페이지 트윗 없음. 스크래핑 완료.");
            break;
          }
        } else {
          consecutiveEmpty = 0;
          const saved = await saveTweets(pool, tweets);
          totalSaved += saved;
          console.log(
            ` -> ${tweets.length}개 추출, ${saved}개 저장 (누적: ${totalSaved})`
          );
        }

        // Cursor of the next page
        const nextCursor = extractNextCursor(html);
        if (!nextCursor) {
          console.log("\n다음 페이지 없음. 스크래핑 완료.");
          break;
        }
        if (seenCursors.has(nextCursor)) {
          console.log("\n동일한 cursor 반복. 스크래핑 중단.");
          break;
        }

        // The page was fetched and processed completely: reset the error
        // streak only here, so a page that fails deterministically after the
        // fetch still hits the retry limit.
        consecutiveErrors = 0;
        seenCursors.add(nextCursor);
        cursor = nextCursor;
        pageNum++;

        // Delay between pages
        await sleep(DELAY_MS);
      } catch (error) {
        console.error(` -> 오류: ${error.message}`);
        consecutiveErrors++;
        if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
          console.log("\n연속 오류. 스크래핑 중단.");
          break;
        }
        // Back off longer before retrying the same page.
        await sleep(DELAY_MS * 3);
      }
    }

    console.log("\n" + "=".repeat(60));
    console.log("스크래핑 완료");
    console.log(`총 저장: ${totalSaved}`);
    console.log("=".repeat(60));

    // Summary statistics over the whole table
    const [stats] = await pool.query(`
SELECT
COUNT(*) as total,
SUM(is_retweet) as retweets,
SUM(NOT is_retweet) as original,
SUM(has_video) as with_video,
MIN(created_at) as oldest,
MAX(created_at) as newest
FROM x_tweets
`);
    console.log("\n[통계]");
    console.log(stats[0]);
  } finally {
    // Release the connections on success and on failure alike.
    await pool.end();
  }

  process.exit(0);
}
// Script entry point: run the scraper; any rejection that escapes main() is
// logged as a fatal error and the process exits with status 1.
main().catch((err) => {
console.error("치명적 오류:", err);
process.exit(1);
});