240 lines
6.3 KiB
JavaScript
240 lines
6.3 KiB
JavaScript
|
|
const https = require("https");
|
||
|
|
const http = require("http");
|
||
|
|
const mysql = require("mysql2/promise");
|
||
|
|
|
||
|
|
// Scraper settings
const NITTER_URL = "http://nitter:8080";
const USERNAME = "realfromis_9";
const DELAY_MS = 1000; // pause between page requests, in milliseconds

// Database connection options; every field can be overridden through the
// environment and falls back to the literal default when the variable is
// unset or empty.
const dbConfig = (({ DB_HOST, DB_USER, DB_PASSWORD, DB_NAME }) => ({
  host: DB_HOST || "mariadb",
  user: DB_USER || "fromis9_user",
  password: DB_PASSWORD || "fromis9_password",
  database: DB_NAME || "fromis9",
}))(process.env);
|
||
|
|
|
||
|
|
/**
 * Downloads a page over HTTP or HTTPS and resolves with its body as text.
 *
 * The response stream is switched to UTF-8 decoding so that multi-byte
 * characters split across two network chunks are reassembled correctly
 * (concatenating raw Buffer chunks into a string decodes each chunk on its
 * own and corrupts such characters).
 *
 * @param {string} url - Absolute URL; a leading "https" selects the https client, anything else uses http.
 * @param {number} [timeoutMs=30000] - Socket inactivity timeout in milliseconds.
 * @returns {Promise<string>} The response body.
 * @throws {Error} Rejects on network errors, on timeout, and on any non-2xx status code.
 */
async function fetchPage(url, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith("https") ? https : http;

    const request = client.get(url, (res) => {
      const { statusCode } = res;
      if (statusCode < 200 || statusCode >= 300) {
        // Drain the body so the socket is released, then report the failure
        // instead of handing an error page to the HTML parser.
        res.resume();
        reject(new Error(`HTTP ${statusCode} for ${url}`));
        return;
      }

      res.setEncoding("utf8");
      let data = "";
      res.on("data", (chunk) => {
        data += chunk;
      });
      res.on("end", () => resolve(data));
      res.on("error", reject);
    });

    // Without a timeout a stalled connection would hang the scraper forever.
    request.setTimeout(timeoutMs, () => {
      request.destroy(new Error(`Request timed out after ${timeoutMs}ms: ${url}`));
    });
    request.on("error", reject);
  });
}
|
||
|
|
|
||
|
|
/**
 * Converts a tweet timestamp such as "Jan 7, 2026 · 12:00 PM UTC" into a
 * MySQL DATETIME string ("2026-01-07 12:00:00", UTC).
 *
 * The expected format is parsed explicitly so the result does not depend on
 * the JavaScript engine's implementation-defined handling of non-ISO date
 * strings. Any other input falls back to the previous `Date`-based parsing.
 *
 * @param {string} timeStr - Timestamp text.
 * @returns {string|null} "YYYY-MM-DD HH:MM:SS", or null when the input is
 *   missing, not a string, or cannot be parsed as a real date.
 */
function parseDateTime(timeStr) {
  if (typeof timeStr !== "string" || timeStr === "") return null;

  const pad2 = (value) => String(value).padStart(2, "0");
  const monthNames = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
  ];

  const parts = timeStr
    .trim()
    .match(/^([A-Za-z]{3})\s+(\d{1,2}),\s*(\d{4})\s*·\s*(\d{1,2}):(\d{2})\s*([AP]M)\s+UTC$/i);

  if (parts) {
    const monthIndex = monthNames.indexOf(parts[1].toLowerCase());
    const day = Number(parts[2]);
    const year = Number(parts[3]);
    const hour12 = Number(parts[4]);
    const minute = Number(parts[5]);
    if (monthIndex === -1 || hour12 < 1 || hour12 > 12 || minute > 59) return null;

    // 12 AM is hour 0 and 12 PM is hour 12 on the 24-hour clock.
    const hour = (hour12 % 12) + (parts[6].toUpperCase() === "PM" ? 12 : 0);

    // Date.UTC silently rolls impossible days over (Feb 31 -> Mar 3); reject those.
    const check = new Date(Date.UTC(year, monthIndex, day, hour, minute));
    if (check.getUTCMonth() !== monthIndex || check.getUTCDate() !== day) return null;

    return `${parts[3]}-${pad2(monthIndex + 1)}-${pad2(day)} ${pad2(hour)}:${pad2(minute)}:00`;
  }

  // Fallback for other formats: let the engine try, still interpreting the text as UTC.
  const cleaned = timeStr.replace(" · ", " ").replace(" UTC", "");
  const date = new Date(`${cleaned} UTC`);
  if (Number.isNaN(date.getTime())) return null;
  return date.toISOString().slice(0, 19).replace("T", " ");
}
|
||
|
|
|
||
|
|
/**
 * Replaces the HTML entities that appear in escaped tweet text
 * (&amp;, &lt;, &gt;, &quot;, &apos; and numeric references) in a single
 * pass, so already-decoded output is never decoded a second time.
 */
function decodeHtmlEntities(text) {
  const named = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);
  return text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[A-Za-z]+);/g, (entity, body) => {
    if (body.startsWith("#")) {
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return codePoint > 0 && codePoint <= 0x10ffff
        ? String.fromCodePoint(codePoint)
        : entity;
    }
    // Unknown named entities are left untouched.
    return named.has(body) ? named.get(body) : entity;
  });
}

/** Turns the inner HTML of a tweet-content element into plain text. */
function cleanTweetText(contentHtml) {
  const withoutMarkup = contentHtml
    .replace(/<br\s*\/?>/g, "\n")
    .replace(/<a[^>]*>([^<]*)<\/a>/g, "$1")
    .replace(/<[^>]+>/g, "");
  // Entities are decoded only after the tags are gone so that an escaped
  // "&lt;b&gt;" in the tweet text is not mistaken for markup.
  return decodeHtmlEntities(withoutMarkup).trim();
}

/** Collects the pbs.twimg.com URLs for every `/pic/...` link in a tweet container. */
function extractImageUrls(container) {
  const urls = [];
  for (const [, encodedPath] of container.matchAll(/href="\/pic\/([^"]+)"/g)) {
    let path;
    try {
      path = decodeURIComponent(encodedPath);
    } catch (e) {
      // A malformed escape sequence must not abort parsing of the whole page.
      continue;
    }
    // NOTE(review): some Nitter builds appear to link the full-size image as
    // "/pic/orig/<path>"; that "orig/" segment is not part of the pbs.twimg.com
    // path, so it is dropped. Confirm against the deployed Nitter version.
    urls.push(`https://pbs.twimg.com/${path.replace(/^orig\//, "")}`);
  }
  return urls;
}

/**
 * Parses a timeline page and returns one record per tweet found.
 *
 * The page is split on the `class="timeline-item ` marker; every piece after
 * the first is treated as one timeline entry. Entries without a
 * `/status/<id>` link are skipped.
 *
 * @param {string} html - Full HTML of a timeline page.
 * @returns {Array<{id: string, isPinned: boolean, isRetweet: boolean,
 *   time: (string|null), text: (string|null), images: string[],
 *   hasVideo: boolean, url: string}>} Extracted tweets in page order.
 */
function extractTweets(html) {
  const tweets = [];
  // Index 0 is whatever precedes the first timeline item, so it is dropped.
  const containers = html.split('class="timeline-item ').slice(1);

  for (const container of containers) {
    const idMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);
    if (!idMatch) continue;
    const id = idMatch[1];

    const timeMatch = container.match(
      /<span class="tweet-date"[^>]*><a[^>]*title="([^"]+)"/
    );
    const contentMatch = container.match(
      /<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/
    );

    tweets.push({
      id,
      // Only a class attribute inside this entry counts. Looking at the
      // previous entry, or for a bare "Pinned" substring, also flags the tweet
      // that follows a pinned one and any tweet whose text contains the word.
      // NOTE(review): assumes the pinned banner is rendered with a "pinned"
      // CSS class inside the timeline item; confirm against real markup.
      isPinned: /class="[^"]*\bpinned\b[^"]*"/.test(container),
      isRetweet: container.includes('class="retweet-header"'),
      time: timeMatch ? parseDateTime(timeMatch[1]) : null,
      // Explicit null (not undefined) when no content element is present.
      text: contentMatch ? cleanTweetText(contentMatch[1]) : null,
      images: extractImageUrls(container),
      hasVideo:
        container.includes("gallery-video") ||
        container.includes("video-container"),
      url: `https://x.com/${USERNAME}/status/${id}`,
    });
  }

  return tweets;
}
|
||
|
|
|
||
|
|
/**
 * Finds the pagination cursor in the first link whose href carries a
 * `cursor` query parameter.
 *
 * The match is confined to a single href attribute value, so it works for
 * absolute ("/user?cursor=..."), relative ("?cursor=...") and multi-parameter
 * ("...&amp;cursor=...") links, and can no longer span unrelated markup
 * between an earlier link and a later "?cursor=" occurrence.
 *
 * @param {string} html - Full HTML of a timeline page.
 * @returns {string|null} The cursor value exactly as it appears in the href
 *   (still URL-encoded), or null when the page has no such link.
 */
function extractNextCursor(html) {
  // [?&;] covers "?cursor=", "&cursor=" and the tail of "&amp;cursor=".
  const cursorMatch = html.match(/href="[^"]*[?&;]cursor=([^"&]+)/);
  return cursorMatch ? cursorMatch[1] : null;
}
|
||
|
|
|
||
|
|
/**
 * Inserts tweets into the `x_tweets` table one at a time and reports how many
 * rows were actually written.
 *
 * `INSERT IGNORE` succeeds without inserting anything when the row already
 * exists, so the count is taken from the `affectedRows` of each result rather
 * than from the number of statements that ran without an error.
 *
 * A failure on one tweet is logged and does not stop the remaining inserts.
 *
 * @param {{query: Function}} pool - Connection pool exposing a promise-based
 *   `query(sql, params)` that resolves to `[result, fields]`.
 * @param {Array<Object>} tweets - Records as produced by `extractTweets`.
 * @returns {Promise<number>} Number of newly inserted rows.
 */
async function saveTweets(pool, tweets) {
  const insertSql = `
        INSERT IGNORE INTO x_tweets (id, username, text, created_at, is_retweet, is_pinned, images, has_video, url)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
      `;
  // Missing fields are bound as SQL NULL explicitly instead of relying on the
  // driver's treatment of `undefined`.
  const orNull = (value) => (value === undefined ? null : value);

  let saved = 0;
  for (const tweet of tweets) {
    try {
      const [result] = await pool.query(insertSql, [
        tweet.id,
        USERNAME,
        orNull(tweet.text),
        orNull(tweet.time),
        tweet.isRetweet,
        tweet.isPinned,
        JSON.stringify(tweet.images || []),
        tweet.hasVideo,
        orNull(tweet.url),
      ]);
      // affectedRows is 0 when the duplicate was ignored.
      if (result && result.affectedRows > 0) {
        saved++;
      }
    } catch (e) {
      console.error(`저장 오류 (ID: ${tweet.id}):`, e.message);
    }
  }
  return saved;
}
|
||
|
|
|
||
|
|
/**
 * Scrapes the whole timeline of USERNAME from the Nitter instance page by
 * page, stores the tweets, prints summary statistics and exits the process.
 *
 * The loop stops when:
 *  - a page has no further cursor (or repeats the current one),
 *  - three consecutive pages contain no tweets, or
 *  - five consecutive page attempts fail.
 *
 * Empty pages and failures are tracked by separate counters so that a run of
 * fetch errors cannot be reported as "no tweets" and vice versa. The pool is
 * closed in a `finally` block so connections are released on failure too.
 *
 * @returns {Promise<void>} Does not resolve in practice: the process exits
 *   with code 0 on completion. Unexpected errors propagate to the caller.
 */
async function main() {
  console.log("=".repeat(60));
  console.log("X 트윗 전체 스크래핑 시작");
  console.log("=".repeat(60));
  console.log(`대상: @${USERNAME}`);
  console.log(`Nitter: ${NITTER_URL}`);
  console.log("");

  const pool = await mysql.createPool(dbConfig);

  try {
    let cursor = null;
    let pageNum = 1;
    let totalSaved = 0;
    let consecutiveEmpty = 0;
    let consecutiveErrors = 0;

    while (true) {
      const url = cursor
        ? `${NITTER_URL}/${USERNAME}?cursor=${cursor}`
        : `${NITTER_URL}/${USERNAME}`;

      console.log(`[페이지 ${pageNum}] 스크래핑 중...`);

      try {
        const html = await fetchPage(url);
        const tweets = extractTweets(html);
        // The page was fetched and parsed, so the error streak is over.
        consecutiveErrors = 0;

        if (tweets.length === 0) {
          consecutiveEmpty++;
          console.log(` -> 트윗 없음 (연속 ${consecutiveEmpty}회)`);
          if (consecutiveEmpty >= 3) {
            console.log("\n연속 3페이지 트윗 없음. 스크래핑 완료.");
            break;
          }
        } else {
          consecutiveEmpty = 0;
          const saved = await saveTweets(pool, tweets);
          totalSaved += saved;
          console.log(
            ` -> ${tweets.length}개 추출, ${saved}개 저장 (누적: ${totalSaved})`
          );
        }

        // Move on to the next page. A cursor identical to the current one
        // would request the same page forever, so it is treated as the end.
        const nextCursor = extractNextCursor(html);
        if (!nextCursor || nextCursor === cursor) {
          console.log("\n다음 페이지 없음. 스크래핑 완료.");
          break;
        }

        cursor = nextCursor;
        pageNum++;

        // Pause between pages.
        await new Promise((r) => setTimeout(r, DELAY_MS));
      } catch (error) {
        console.error(` -> 오류: ${error.message}`);
        consecutiveErrors++;
        if (consecutiveErrors >= 5) {
          console.log("\n연속 오류. 스크래핑 중단.");
          break;
        }
        // The cursor is unchanged, so the same page is retried after a longer pause.
        await new Promise((r) => setTimeout(r, DELAY_MS * 3));
      }
    }

    console.log("\n" + "=".repeat(60));
    console.log("스크래핑 완료");
    console.log(`총 저장: ${totalSaved}개`);
    console.log("=".repeat(60));

    // Print table-wide statistics.
    const [stats] = await pool.query(`
        SELECT
          COUNT(*) as total,
          SUM(is_retweet) as retweets,
          SUM(NOT is_retweet) as original,
          SUM(has_video) as with_video,
          MIN(created_at) as oldest,
          MAX(created_at) as newest
        FROM x_tweets
      `);
    console.log("\n[통계]");
    console.log(stats[0]);
  } finally {
    await pool.end();
  }

  process.exit(0);
}
|
||
|
|
|
||
|
|
// Script entry point: run the scraper; anything that escapes main() is
// reported and turned into a non-zero exit code.
main().catch((fatalError) => {
  console.error("치명적 오류:", fatalError);
  process.exit(1);
});
|