import { parseNitterDateTime } from '../../utils/date.js';

/** Pause between two successful page requests, in milliseconds. */
const PAGE_DELAY_MS = 1000;

/** Pause before retrying after a failed page request, in milliseconds. */
const ERROR_DELAY_MS = 3000;

/** Pagination stops when the shared miss counter reaches this value on an empty page. */
const MAX_EMPTY_PAGES = 3;

/** Pagination stops when the shared miss counter reaches this value on a failed request. */
const MAX_FAILURES = 5;

/** Matches Nitter image links of the form `href="/pic/[orig/]media%2F<path>"`. */
const IMAGE_LINK_RE = /href="\/pic\/(orig\/)?media%2F([^"]+)"/g;

/**
 * YouTube URL shapes that carry an 11-character video id, in the order
 * their matches are reported: youtu.be/{id}, watch?v={id}, shorts/{id}.
 */
const YOUTUBE_ID_PATTERNS = [
  /youtu\.be\/([a-zA-Z0-9_-]{11})/g,
  /youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11})/g,
  /youtube\.com\/shorts\/([a-zA-Z0-9_-]{11})/g,
];

/**
 * Resolve after the given number of milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Percent-decode a string, returning it unchanged when it contains a
 * malformed escape sequence instead of throwing `URIError`.
 *
 * @param {string} value - Possibly percent-encoded text.
 * @returns {string} Decoded text, or `value` itself if decoding fails.
 */
function safeDecode(value) {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}

/**
 * Download a page and return its body as text.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string>} Response body.
 * @throws {Error} When the request fails or the response status is not 2xx,
 *   so that an error page is never parsed as if it were a timeline.
 */
async function fetchHtml(url) {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`HTTP ${res.status} ${res.statusText} (${url})`);
  }
  return res.text();
}

/**
 * Reduce the inner HTML of a tweet body to plain text: line breaks become
 * newlines, links are replaced by their visible label, and every remaining
 * tag is dropped. HTML entities are left as they are.
 *
 * @param {string} fragment - Inner HTML of the tweet content element.
 * @returns {string} Trimmed plain text.
 */
function tweetHtmlToText(fragment) {
  return fragment
    .replace(/<br\s*\/?>/g, '\n')
    .replace(/<a[^>]*>([^<]*)<\/a>/g, '$1')
    .replace(/<[^>]+>/g, '')
    .trim();
}

/**
 * Extract the first paragraph of a tweet's text (used as a title).
 * Paragraphs are separated by one or more blank lines.
 *
 * @param {string} text - Tweet text.
 * @returns {string} Trimmed first paragraph, or `''` for empty input.
 */
export function extractTitle(text) {
  if (!text) return '';
  const [firstParagraph = ''] = text.split(/\n\n+/);
  return firstParagraph.trim();
}

/**
 * Extract image URLs from HTML.
 *
 * Every `/pic/[orig/]media%2F...` link is rewritten to a
 * `https://pbs.twimg.com/media/...` URL with any query string removed.
 *
 * @param {string} html - HTML to scan.
 * @returns {string[]} Unique URLs in order of first appearance.
 */
export function extractImageUrls(html) {
  if (!html) return [];

  const urls = new Set();
  for (const [, , encodedPath] of html.matchAll(IMAGE_LINK_RE)) {
    const mediaPath = safeDecode(encodedPath);
    // The query may still be percent-encoded after one decode pass
    // (double-encoded links), so cut at both '%3F' and '?'.
    const cleanPath = mediaPath.split('%3F')[0].split('?')[0];
    urls.add(`https://pbs.twimg.com/media/${cleanPath}`);
  }
  return [...urls];
}

/**
 * Extract YouTube video ids from text.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Unique ids; all youtu.be matches come first, then
 *   watch?v= matches, then shorts matches.
 */
export function extractYoutubeVideoIds(text) {
  if (!text) return [];

  const ids = new Set();
  for (const pattern of YOUTUBE_ID_PATTERNS) {
    for (const [, videoId] of text.matchAll(pattern)) {
      ids.add(videoId);
    }
  }
  return [...ids];
}

/**
 * Extract profile information from HTML.
 *
 * @param {string} html - Profile page HTML.
 * @returns {{displayName: string|null, avatarUrl: string|null}} Fields that
 *   could not be found stay `null`.
 */
export function extractProfile(html) {
  const profile = { displayName: null, avatarUrl: null };
  if (!html) return profile;

  const nameMatch = html.match(/class="profile-card-fullname"[^>]*title="([^"]+)"/);
  if (nameMatch) {
    profile.displayName = nameMatch[1].trim();
  }

  // Expects an <img> nested inside the element carrying class "profile-card-avatar".
  const avatarMatch = html.match(/class="profile-card-avatar"[^>]*>[\s\S]*?<img[^>]*src="([^"]+)"/);
  if (avatarMatch) {
    const [, src] = avatarMatch;
    // Proxied images look like "/pic/<percent-encoded target>"; unwrap the target.
    // NOTE(review): the decoded value is returned exactly as embedded in the
    // page (it may lack a scheme) — confirm what callers expect.
    const proxied = src.match(/\/pic\/(.+)/);
    profile.avatarUrl = proxied ? safeDecode(proxied[1]) : src;
  }

  return profile;
}

/**
 * Parse the list of tweets out of timeline HTML.
 *
 * Pinned tweets and retweets are skipped, as are items without a status id
 * or for which `parseNitterDateTime` yields a falsy value.
 *
 * @param {string} html - Timeline page HTML.
 * @param {string} username - Account name used to build each tweet's x.com URL.
 * @returns {Array<{id: string, time: *, text: string, imageUrls: string[], url: string}>}
 *   Parsed tweets in page order; `time` is whatever `parseNitterDateTime` returns.
 */
export function parseTweets(html, username) {
  if (!html) return [];

  const tweets = [];
  // Everything before the first marker is page chrome, not a timeline item.
  const [, ...containers] = html.split('class="timeline-item ');

  for (const container of containers) {
    // Exclude pinned tweets and retweets.
    const isPinned = container.includes('class="pinned"');
    const isRetweet = container.includes('class="retweet-header"');
    if (isPinned || isRetweet) continue;

    // Tweet id.
    const idMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);
    if (!idMatch) continue;
    const [, id] = idMatch;

    // Timestamp: expects <span class="tweet-date"><a ... title="...">.
    // NOTE(review): verify this markup against the Nitter instance in use.
    const timeMatch = container.match(/<span class="tweet-date"[^>]*><a[^>]*title="([^"]+)"/);
    const time = timeMatch ? parseNitterDateTime(timeMatch[1]) : null;
    if (!time) continue;

    // Text: inner HTML of the first <div class="tweet-content ...">.
    const contentMatch = container.match(/<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/);
    const text = contentMatch ? tweetHtmlToText(contentMatch[1]) : '';

    tweets.push({
      id,
      time,
      text,
      imageUrls: extractImageUrls(container),
      url: `https://x.com/${username}/status/${id}`,
    });
  }

  return tweets;
}

/**
 * Collect tweets from Nitter (first page only).
 *
 * @param {string} nitterUrl - Base URL of the Nitter instance (no trailing slash).
 * @param {string} username - Account to scrape.
 * @returns {Promise<{tweets: Array<object>, profile: {displayName: string|null, avatarUrl: string|null}}>}
 * @throws {Error} When the request fails or returns a non-2xx status.
 */
export async function fetchTweets(nitterUrl, username) {
  const html = await fetchHtml(`${nitterUrl}/${username}`);

  const profile = extractProfile(html);
  const tweets = parseTweets(html, username);

  return { tweets, profile };
}

/**
 * Collect all tweets from Nitter by following the "show more" cursor.
 *
 * One counter tracks consecutive misses: it is incremented by every empty
 * page and every failed request, and reset by any page that yields tweets.
 * Pagination ends when no next cursor is found, when an empty page brings
 * the counter to MAX_EMPTY_PAGES, or when a failure brings it to
 * MAX_FAILURES. A failed request is retried with the same cursor.
 *
 * @param {string} nitterUrl - Base URL of the Nitter instance (no trailing slash).
 * @param {string} username - Account to scrape.
 * @param {{info: Function, error: Function}} [log] - Optional logger.
 * @returns {Promise<Array<object>>} Tweets from every page, in fetch order.
 */
export async function fetchAllTweets(nitterUrl, username, log) {
  const allTweets = [];
  let cursor = null;
  let pageNum = 1;
  let emptyCount = 0;

  while (true) {
    const url = cursor
      ? `${nitterUrl}/${username}?cursor=${cursor}`
      : `${nitterUrl}/${username}`;

    log?.info(`[페이지 ${pageNum}] 스크래핑 중...`);

    try {
      const html = await fetchHtml(url);
      const tweets = parseTweets(html, username);

      if (tweets.length === 0) {
        emptyCount++;
        if (emptyCount >= MAX_EMPTY_PAGES) break;
      } else {
        emptyCount = 0;
        allTweets.push(...tweets);
        log?.info(` -> ${tweets.length}개 추출 (누적: ${allTweets.length})`);
      }

      // Cursor of the next page; its absence means the last page was reached.
      const cursorMatch = html.match(/class="show-more"[^>]*>\s*<a href="\?cursor=([^"]+)"/);
      if (!cursorMatch) break;

      cursor = cursorMatch[1];
      pageNum++;
      await sleep(PAGE_DELAY_MS);
    } catch (err) {
      log?.error(` -> 오류: ${err.message}`);
      emptyCount++;
      if (emptyCount >= MAX_FAILURES) break;
      await sleep(ERROR_DELAY_MS);
    }
  }

  return allTweets;
}