fromis_9/backend/scrape_search.cjs

230 lines
6.2 KiB
JavaScript
Raw Normal View History

const https = require("https");
const http = require("http");
const mysql = require("mysql2/promise");
// Settings
const NITTER_URL = "http://nitter:8080";
const USERNAME = "realfromis_9";
const DELAY_MS = 1500;

// Search window (from the X account migration date up to the start of the existing scrape)
const SEARCH_SINCE = "2025-04-24";
const SEARCH_UNTIL = "2025-06-16";

// DB connection: each field comes from its environment variable, or the
// built-in default when that variable is unset or empty.
const dbConfig = (({ DB_HOST, DB_USER, DB_PASSWORD, DB_NAME }) => ({
  host: DB_HOST || "mariadb",
  user: DB_USER || "fromis9_user",
  password: DB_PASSWORD || "fromis9_password",
  database: DB_NAME || "fromis9",
}))(process.env);
/**
 * Downloads `url` with a GET request and resolves with the response body.
 *
 * The `https` module is used when the URL starts with "https", otherwise `http`.
 * The body is decoded as UTF-8 by the stream itself, so a multi-byte character
 * that is split across two network chunks is still decoded correctly.
 *
 * @param {string} url - Absolute http(s) URL to fetch.
 * @param {number} [timeoutMs=30000] - Socket inactivity timeout in milliseconds.
 * @returns {Promise<string>} The full response body.
 * Rejects with an Error on a network failure, a non-2xx status code, or a timeout.
 */
function fetchPage(url, timeoutMs = 30000) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith("https") ? https : http;
    const req = client.get(url, (res) => {
      const { statusCode } = res;
      if (statusCode < 200 || statusCode >= 300) {
        // Drain the body so the socket is released, then report the failure
        // instead of handing an error page to the HTML parser.
        res.resume();
        reject(new Error(`HTTP ${statusCode} for ${url}`));
        return;
      }
      // Without this, chunks arrive as Buffers and `data += chunk` decodes each
      // one separately, corrupting characters that straddle a chunk boundary.
      res.setEncoding("utf8");
      let data = "";
      res.on("data", (chunk) => {
        data += chunk;
      });
      res.on("end", () => resolve(data));
      res.on("error", reject);
    });
    req.on("error", reject);
    // Destroying the request with an error makes the "error" handler above reject.
    req.setTimeout(timeoutMs, () => {
      req.destroy(new Error(`Request timed out after ${timeoutMs} ms: ${url}`));
    });
  });
}
/**
 * Converts a timestamp title such as "Apr 24, 2025 · 3:15 PM UTC" into a
 * "YYYY-MM-DD HH:MM:SS" string expressed in UTC.
 *
 * @param {string} timeStr - Raw timestamp text.
 * @returns {string|null} The formatted timestamp, or null when the input is
 *   falsy, is not a string, or cannot be parsed as a date.
 */
function parseDateTime(timeStr) {
  if (!timeStr) {
    return null;
  }
  try {
    // Drop the first " · " separator and the first " UTC" marker, then append
    // " UTC" so the value is always interpreted as UTC.
    const withoutSeparator = timeStr.replace(" · ", " ");
    const withoutZone = withoutSeparator.replace(" UTC", "");
    const parsed = new Date(withoutZone + " UTC");
    const epochMs = parsed.getTime();
    if (Number.isNaN(epochMs)) {
      return null;
    }
    // "2025-04-24T15:15:00.000Z" -> "2025-04-24 15:15:00"
    const isoSeconds = parsed.toISOString().slice(0, 19);
    return isoSeconds.replace("T", " ");
  } catch (e) {
    // Reached when timeStr has no usable string methods (e.g. a number).
    return null;
  }
}
/**
 * Decodes the percent-encoded path taken from a `/pic/...` link.
 * `decodeURIComponent` throws URIError on a malformed escape sequence; in that
 * case the raw path is returned so one bad link cannot abort the whole page.
 *
 * @param {string} encodedPath - Path portion following `/pic/`.
 * @returns {string} The decoded path, or `encodedPath` unchanged if it is malformed.
 */
function decodePicPath(encodedPath) {
  try {
    return decodeURIComponent(encodedPath);
  } catch (e) {
    return encodedPath;
  }
}

/**
 * Extracts tweet records from the HTML of a search result page.
 *
 * The page is cut at every `class="timeline-item ` marker; each piece after the
 * first is treated as one tweet container. Containers without a
 * `/<user>/status/<digits>` link are skipped.
 *
 * Each record has: `id`, `time` (via parseDateTime, or null), `text` (tags
 * stripped, `<br>` turned into newlines, HTML entities left as-is; null when no
 * tweet-content div is found), `images` (absolute pbs.twimg.com URLs),
 * `hasVideo`, `isRetweet`, `isPinned` (always false here) and `url`.
 *
 * @param {string} html - Full HTML of one search result page.
 * @returns {Array<Object>} Tweet records in page order.
 */
function extractSearchTweets(html) {
  const PIC_PREFIX = 'href="/pic/';
  const tweets = [];
  const containers = html.split('class="timeline-item ').slice(1);

  for (const container of containers) {
    const linkMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);
    if (!linkMatch) {
      // No status link means there is no tweet id to store; skip the container.
      continue;
    }
    const id = linkMatch[1];

    const timeMatch = container.match(
      /<span class="tweet-date"[^>]*><a[^>]*title="([^"]+)"/
    );
    const contentMatch = container.match(
      /<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/
    );
    const text = contentMatch
      ? contentMatch[1]
          .replace(/<br\s*\/?>/g, "\n")
          .replace(/<a[^>]*>([^<]*)<\/a>/g, "$1")
          .replace(/<[^>]+>/g, "")
          .trim()
      : null;

    // Every match has the exact form href="/pic/<path>", so the path is what
    // remains after the fixed prefix and the closing quote are removed.
    const picLinks = container.match(/href="\/pic\/([^"]+)"/g) || [];
    const images = picLinks.map(
      (link) =>
        "https://pbs.twimg.com/" +
        decodePicPath(link.slice(PIC_PREFIX.length, -1))
    );

    tweets.push({
      isPinned: false,
      isRetweet: container.includes('class="retweet-header"'),
      id,
      time: timeMatch ? parseDateTime(timeMatch[1]) : null,
      text,
      images,
      hasVideo:
        container.includes("gallery-video") ||
        container.includes("video-container"),
      url: `https://x.com/${USERNAME}/status/${id}`,
    });
  }

  return tweets;
}
/**
 * Finds the pagination cursor for the next page of results.
 *
 * Looks for an element whose class attribute is exactly "show-more", followed
 * by a link whose href contains `cursor=`, and returns the cursor value up to
 * the next `"` or `&`.
 *
 * @param {string} html - Full HTML of one search result page.
 * @returns {string|null} The cursor exactly as written in the href, or null
 *   when no such link is present.
 */
function extractNextCursor(html) {
  const showMoreLink = /class="show-more"[^>]*>\s*<a href="[^"]*cursor=([^"&]+)/;
  const found = html.match(showMoreLink);
  if (!found) {
    return null;
  }
  const [, cursorValue] = found;
  return cursorValue;
}
/**
 * Inserts tweets into the `x_tweets` table one at a time with INSERT IGNORE.
 *
 * A failure on one row is logged and does not stop the remaining rows.
 *
 * @param {Object} pool - Object exposing `query(sql, params)` that resolves to
 *   an array whose first element carries `affectedRows`.
 * @param {Array<Object>} tweets - Records with id, text, time, isRetweet,
 *   isPinned, images, hasVideo and url.
 * @returns {Promise<number>} How many inserts reported `affectedRows > 0`.
 */
async function saveTweets(pool, tweets) {
  const insertSql = [
    "INSERT IGNORE INTO x_tweets (id, username, text, created_at, is_retweet, is_pinned, images, has_video, url)",
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
  ].join("\n");

  let insertedCount = 0;
  for (const entry of tweets) {
    try {
      // Order must match the column list in the statement above.
      const params = [
        entry.id,
        USERNAME,
        entry.text,
        entry.time,
        entry.isRetweet,
        entry.isPinned,
        JSON.stringify(entry.images),
        entry.hasVideo,
        entry.url,
      ];
      const [outcome] = await pool.query(insertSql, params);
      if (outcome.affectedRows > 0) {
        insertedCount += 1;
      }
    } catch (e) {
      console.error(`저장 오류 (ID: ${entry.id}):`, e.message);
    }
  }
  return insertedCount;
}
/**
 * Script entry point: pages through the search results for USERNAME between
 * SEARCH_SINCE and SEARCH_UNTIL, stores the tweets found, prints table-wide
 * statistics, and exits the process with status 0.
 *
 * The loop ends when:
 *  - a page has no next-page cursor, or repeats the cursor just used
 *    (following it again would fetch the same page forever);
 *  - MAX_EMPTY_PAGES consecutive fetched pages contain no tweets;
 *  - MAX_ERRORS consecutive attempts throw.
 *
 * Empty pages and errors are counted separately so that a transient error does
 * not shorten the empty-page allowance, and vice versa. After an error the same
 * page is retried following a longer delay.
 *
 * The pool is closed in `finally`, so it is released even when an error
 * escapes; the rejection then reaches the caller of main().
 *
 * @returns {Promise<void>} Does not return normally on success (process.exit).
 */
async function main() {
  const MAX_EMPTY_PAGES = 3;
  const MAX_ERRORS = 5;
  const rule = "=".repeat(60);
  const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

  console.log(rule);
  console.log("X 트윗 검색 스크래핑 (누락 기간)");
  console.log(rule);
  console.log(`대상: @${USERNAME}`);
  console.log(`기간: ${SEARCH_SINCE} ~ ${SEARCH_UNTIL}`);
  console.log("");

  const pool = await mysql.createPool(dbConfig);
  try {
    const searchQuery = encodeURIComponent(
      `from:${USERNAME} since:${SEARCH_SINCE} until:${SEARCH_UNTIL}`
    );
    const firstPageUrl = `${NITTER_URL}/search?f=tweets&q=${searchQuery}`;

    let cursor = null;
    let pageNum = 1;
    let totalSaved = 0;
    let consecutiveEmpty = 0;
    let consecutiveErrors = 0;

    while (true) {
      // The cursor is inserted exactly as it appeared in the page's href.
      const url = cursor ? `${firstPageUrl}&cursor=${cursor}` : firstPageUrl;
      console.log(`[페이지 ${pageNum}] 검색 중...`);
      try {
        const html = await fetchPage(url);
        const tweets = extractSearchTweets(html);
        // The page was fetched and parsed, so the error streak is over.
        consecutiveErrors = 0;

        if (tweets.length === 0) {
          consecutiveEmpty++;
          console.log(` -> 트윗 없음 (연속 ${consecutiveEmpty}회)`);
          if (consecutiveEmpty >= MAX_EMPTY_PAGES) {
            console.log(
              `\n연속 ${MAX_EMPTY_PAGES}페이지 트윗 없음. 스크래핑 완료.`
            );
            break;
          }
        } else {
          consecutiveEmpty = 0;
          const saved = await saveTweets(pool, tweets);
          totalSaved += saved;
          console.log(
            ` -> ${tweets.length}개 추출, ${saved}개 저장 (누적: ${totalSaved})`
          );
        }

        const nextCursor = extractNextCursor(html);
        if (!nextCursor || nextCursor === cursor) {
          console.log("\n다음 페이지 없음. 스크래핑 완료.");
          break;
        }
        cursor = nextCursor;
        pageNum++;
        await sleep(DELAY_MS);
      } catch (error) {
        console.error(` -> 오류: ${error.message}`);
        consecutiveErrors++;
        if (consecutiveErrors >= MAX_ERRORS) {
          console.log("\n연속 오류. 스크래핑 중단.");
          break;
        }
        // Back off longer than the normal page delay before retrying this page.
        await sleep(DELAY_MS * 3);
      }
    }

    console.log("\n" + rule);
    console.log("검색 스크래핑 완료");
    console.log(`추가 저장: ${totalSaved}`);
    console.log(rule);

    // Same statement text as before, assembled line by line.
    const statsSql = [
      "",
      "SELECT",
      "COUNT(*) as total,",
      "SUM(is_retweet) as retweets,",
      "SUM(NOT is_retweet) as original,",
      "MIN(created_at) as oldest,",
      "MAX(created_at) as newest",
      "FROM x_tweets",
      "",
    ].join("\n");
    const [stats] = await pool.query(statsSql);
    console.log("\n[전체 통계]");
    console.log(stats[0]);
  } finally {
    await pool.end();
  }
  process.exit(0);
}
// Start the script. Any error that escapes main() is logged and the process
// exits with status 1.
function reportFatalError(err) {
  console.error("치명적 오류:", err);
  process.exit(1);
}

main().catch(reportFatalError);