fromis_9/backend/src/services/x/scraper.js

import { parseNitterDateTime } from '../../utils/date.js';

/**
 * Extract the first paragraph of a tweet's text, for use as a title.
 *
 * Paragraphs are separated by one or more blank lines (LF or CRLF line
 * endings). Surrounding whitespace is ignored, so text that begins with blank
 * lines still yields its first paragraph that has content.
 *
 * @param {string} text - Raw tweet text.
 * @returns {string} The trimmed first paragraph, or '' when there is none.
 */
export function extractTitle(text) {
  if (!text) return '';
  // Trim before splitting: otherwise a leading blank line makes the first
  // "paragraph" empty and the title is lost.
  const [firstParagraph = ''] = text.trim().split(/(?:\r?\n){2,}/);
  return firstParagraph.trim();
}

/**
 * Extract tweet image URLs from an HTML fragment.
 *
 * Looks for `href="/pic/media%2F..."` and `href="/pic/orig/media%2F..."`
 * links, decodes the percent-encoded media path, drops any query string, and
 * rebuilds the URL on `https://pbs.twimg.com/media/`.
 *
 * @param {string} html - HTML to scan.
 * @returns {string[]} Unique image URLs in order of first appearance.
 */
export function extractImageUrls(html) {
  if (!html) return [];

  const urls = new Set();
  const linkPattern = /href="\/pic\/(orig\/)?media%2F([^"]+)"/g;

  for (const match of String(html).matchAll(linkPattern)) {
    let mediaPath;
    try {
      mediaPath = decodeURIComponent(match[2]);
    } catch {
      // Malformed percent-encoding in one link must not abort the whole
      // parse; skip just this link.
      continue;
    }

    // Strip the query part, whether it is still encoded (%3F) or literal (?).
    const cleanPath = mediaPath.split('%3F')[0].split('?')[0];
    if (!cleanPath) continue;

    urls.add(`https://pbs.twimg.com/media/${cleanPath}`);
  }

  return [...urls];
}

/**
 * Collect YouTube video ids mentioned in a piece of text.
 *
 * Three URL shapes are recognised, each followed by an 11-character id:
 * `youtu.be/{id}`, `youtube.com/watch?v={id}` and `youtube.com/shorts/{id}`.
 * The result is de-duplicated; ids are grouped by URL shape in that order.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Unique video ids.
 */
export function extractYoutubeVideoIds(text) {
  if (!text) return [];

  // Scanned in this order, which determines the order of the returned ids.
  const patterns = [
    /youtu\.be\/([a-zA-Z0-9_-]{11})/g,
    /youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11})/g,
    /youtube\.com\/shorts\/([a-zA-Z0-9_-]{11})/g,
  ];

  const collected = new Set();
  for (const pattern of patterns) {
    for (let hit = pattern.exec(text); hit !== null; hit = pattern.exec(text)) {
      collected.add(hit[1]);
    }
  }

  return Array.from(collected);
}

/**
 * Extract profile information from a profile page's HTML.
 *
 * - `displayName` comes from the `title` attribute of the element with class
 *   `profile-card-fullname`.
 * - `avatarUrl` comes from the `src` of the first `<img>` after the element
 *   with class `profile-card-avatar`. When the src contains `/pic/`, the part
 *   after it is percent-decoded and used as the URL.
 *
 * @param {string} html - Page HTML.
 * @returns {{displayName: (string|null), avatarUrl: (string|null)}} Fields are
 *   null when the corresponding markup is not found.
 */
export function extractProfile(html) {
  const profile = { displayName: null, avatarUrl: null };
  if (typeof html !== 'string') return profile;

  const nameMatch = html.match(/class="profile-card-fullname"[^>]*title="([^"]+)"/);
  if (nameMatch) {
    profile.displayName = nameMatch[1].trim();
  }

  const avatarMatch = html.match(/class="profile-card-avatar"[^>]*>[\s\S]*?<img[^>]*src="([^"]+)"/);
  if (avatarMatch) {
    const rawSrc = avatarMatch[1];
    let url = rawSrc;

    const encodedMatch = rawSrc.match(/\/pic\/(.+)/);
    if (encodedMatch) {
      try {
        url = decodeURIComponent(encodedMatch[1]);
      } catch {
        // Malformed percent-encoding: keep the raw src instead of throwing,
        // so the display name already found is still returned.
        url = rawSrc;
      }
    }

    profile.avatarUrl = url;
  }

  return profile;
}

/** Named HTML entities that are decoded in tweet text. */
const TWEET_TEXT_ENTITIES = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: '\u00a0',
};

/**
 * Decode named (see TWEET_TEXT_ENTITIES) and numeric HTML entities in a single
 * pass, so already-decoded output is never decoded twice. Unknown or
 * out-of-range entities are left as-is.
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (entity, body) => {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws above U+10FFFF; keep such entities verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return Object.hasOwn(TWEET_TEXT_ENTITIES, body) ? TWEET_TEXT_ENTITIES[body] : entity;
  });
}

/**
 * Parse the list of tweets out of a timeline page's HTML.
 *
 * The page is split on `class="timeline-item `; each following chunk is treated
 * as one tweet container. A container is skipped when it is pinned, is a
 * retweet, has no status link, or has no date that `parseNitterDateTime`
 * can turn into a truthy value.
 *
 * @param {string} html - Timeline page HTML.
 * @param {string} username - Account name, used to build each tweet's x.com URL.
 * @returns {Array<{id: string, time: *, text: string, imageUrls: string[], url: string}>}
 *   Parsed tweets in page order. `time` is whatever `parseNitterDateTime`
 *   returns; `text` is plain text with tags removed and HTML entities decoded.
 */
export function parseTweets(html, username) {
  const tweets = [];
  const containers = html.split('class="timeline-item ');

  // Index 0 is everything before the first timeline item.
  for (let i = 1; i < containers.length; i++) {
    const container = containers[i];

    // Exclude pinned tweets and retweets.
    const isPinned = container.includes('class="pinned"');
    const isRetweet = container.includes('class="retweet-header"');
    if (isPinned || isRetweet) continue;

    // Tweet id: taken from the first /{user}/status/{id} link in the container.
    const idMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);
    if (!idMatch) continue;
    const id = idMatch[1];

    // Timestamp: read from the title attribute of the tweet-date link.
    const timeMatch = container.match(/<span class="tweet-date"[^>]*><a[^>]*title="([^"]+)"/);
    const time = timeMatch ? parseNitterDateTime(timeMatch[1]) : null;
    if (!time) continue;

    // Text: <br> becomes a newline, links keep only their label, remaining
    // tags are dropped, then entities are decoded so "&amp;" reads as "&".
    const contentMatch = container.match(/<div class="tweet-content[^"]*"[^>]*>([\s\S]*?)<\/div>/);
    let text = '';
    if (contentMatch) {
      const stripped = contentMatch[1]
        .replace(/<br\s*\/?>/g, '\n')
        .replace(/<a[^>]*>([^<]*)<\/a>/g, '$1')
        .replace(/<[^>]+>/g, '');
      // Decode after tag stripping so an escaped "&lt;b&gt;" stays literal text.
      text = decodeHtmlEntities(stripped).trim();
    }

    // Images attached to this tweet.
    const imageUrls = extractImageUrls(container);

    tweets.push({
      id,
      time,
      text,
      imageUrls,
      url: `https://x.com/${username}/status/${id}`,
    });
  }

  return tweets;
}

/**
 * Fetch the first timeline page for a user from a Nitter instance and parse it.
 *
 * @param {string} nitterUrl - Base URL of the Nitter instance (no trailing slash).
 * @param {string} username - Account name appended to the base URL.
 * @returns {Promise<{tweets: Array, profile: Object}>} Result of `parseTweets`
 *   and `extractProfile` on the fetched page.
 * @throws {Error} When the request fails or the response status is not 2xx.
 */
export async function fetchTweets(nitterUrl, username) {
  const url = `${nitterUrl}/${username}`;
  const res = await fetch(url);

  // An error page (rate limit, instance down, ...) would otherwise be parsed
  // as "no tweets, no profile", which is indistinguishable from a real result.
  if (!res.ok) {
    throw new Error(`Nitter request failed: HTTP ${res.status} (${url})`);
  }

  const html = await res.text();

  // Profile information.
  const profile = extractProfile(html);

  // Tweets on the page.
  const tweets = parseTweets(html, username);

  return { tweets, profile };
}

/**
 * Fetch every timeline page for a user from a Nitter instance, following the
 * "show more" cursor from page to page.
 *
 * Stopping rules:
 * - the page has no next-page cursor, or the cursor did not advance;
 * - three consecutive pages yield no tweets;
 * - five consecutive failures (request error or non-2xx status), counted on the
 *   same counter as empty pages, which resets whenever a page yields tweets.
 *
 * Waits 1 s between pages and 3 s before retrying after an error. Errors are
 * logged and retried rather than thrown; tweets collected so far are returned.
 *
 * @param {string} nitterUrl - Base URL of the Nitter instance (no trailing slash).
 * @param {string} username - Account name.
 * @param {{info: Function, error: Function}} [log] - Optional logger.
 * @returns {Promise<Array>} All tweets parsed from the visited pages, in visit order.
 */
export async function fetchAllTweets(nitterUrl, username, log) {
  const allTweets = [];
  let cursor = null;
  let pageNum = 1;
  let emptyCount = 0;

  while (true) {
    const url = cursor
      ? `${nitterUrl}/${username}?cursor=${cursor}`
      : `${nitterUrl}/${username}`;

    log?.info(`[페이지 ${pageNum}] 스크래핑 중...`);

    try {
      const res = await fetch(url);
      // Route error responses to the retry path below; parsing an error page
      // would look like "empty page, no cursor" and end the crawl silently.
      if (!res.ok) {
        throw new Error(`HTTP ${res.status}`);
      }

      const html = await res.text();
      const tweets = parseTweets(html, username);

      if (tweets.length === 0) {
        emptyCount++;
        if (emptyCount >= 3) break;
      } else {
        emptyCount = 0;
        allTweets.push(...tweets);
        log?.info(`  -> ${tweets.length}개 추출 (누적: ${allTweets.length})`);
      }

      // Cursor for the next page.
      const cursorMatch = html.match(/class="show-more"[^>]*>\s*<a href="\?cursor=([^"]+)"/);
      if (!cursorMatch) break;

      // A cursor that does not advance would refetch the same page forever.
      if (cursorMatch[1] === cursor) break;

      cursor = cursorMatch[1];
      pageNum++;

      await new Promise(r => setTimeout(r, 1000));
    } catch (err) {
      log?.error(`  -> 오류: ${err.message}`);
      emptyCount++;
      if (emptyCount >= 5) break;
      // Back off before retrying the same URL.
      await new Promise(r => setTimeout(r, 3000));
    }
  }

  return allTweets;
}
refactor: Express에서 Fastify로 백엔드 마이그레이션 - Express → Fastify 5 프레임워크 전환 - 플러그인 기반 아키텍처로 재구성 - plugins/db.js: MariaDB 연결 풀 - plugins/redis.js: Redis 클라이언트 - plugins/scheduler.js: 봇 스케줄러 (node-cron) - 봇 설정 방식 변경: DB 테이블 → 설정 파일 (config/bots.js) - 봇 상태 저장: DB → Redis - YouTube/X 봇 서비스 분리 및 개선 - 날짜 유틸리티 KST 변환 수정 - 미사용 환경변수 정리 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> 2026-01-16 21:11:02 +09:00			`import { parseNitterDateTime } from '../../utils/date.js';`

			`/**`
			`* 트윗 텍스트에서 첫 문단 추출 (title용)`
			`*/`
			`export function extractTitle(text) {`
			`if (!text) return '';`
			`const paragraphs = text.split(/\n\n+/);`
			`return paragraphs[0]?.trim() \|\| '';`
			`}`

			`/**`
			`* HTML에서 이미지 URL 추출`
			`*/`
			`export function extractImageUrls(html) {`
			`const urls = [];`
			`const regex = /href="\/pic\/(orig\/)?media%2F([^"]+)"/g;`
			`let match;`
			`while ((match = regex.exec(html)) !== null) {`
			`const mediaPath = decodeURIComponent(match[2]);`
			`const cleanPath = mediaPath.split('%3F')[0].split('?')[0];`
			urls.push(`https://pbs.twimg.com/media/${cleanPath}`);
			`}`
			`return [...new Set(urls)];`
			`}`

			`/**`
			`* 텍스트에서 유튜브 videoId 추출`
			`*/`
			`export function extractYoutubeVideoIds(text) {`
			`if (!text) return [];`
			`const ids = new Set();`

			`// youtu.be/{id}`
			`const shortRegex = /youtu\.be\/([a-zA-Z0-9_-]{11})/g;`
			`let m;`
			`while ((m = shortRegex.exec(text)) !== null) {`
			`ids.add(m[1]);`
			`}`

			`// youtube.com/watch?v={id}`
			`const watchRegex = /youtube\.com\/watch\?v=([a-zA-Z0-9_-]{11})/g;`
			`while ((m = watchRegex.exec(text)) !== null) {`
			`ids.add(m[1]);`
			`}`

			`// youtube.com/shorts/{id}`
			`const shortsRegex = /youtube\.com\/shorts\/([a-zA-Z0-9_-]{11})/g;`
			`while ((m = shortsRegex.exec(text)) !== null) {`
			`ids.add(m[1]);`
			`}`

			`return [...ids];`
			`}`

			`/**`
			`* HTML에서 프로필 정보 추출`
			`*/`
			`export function extractProfile(html) {`
			`const profile = { displayName: null, avatarUrl: null };`

			`const nameMatch = html.match(/class="profile-card-fullname"[^>]*title="([^"]+)"/);`
			`if (nameMatch) {`
			`profile.displayName = nameMatch[1].trim();`
			`}`

			`const avatarMatch = html.match(/class="profile-card-avatar"[^>]>[\s\S]?<img[^>]*src="([^"]+)"/);`
			`if (avatarMatch) {`
			`let url = avatarMatch[1];`
			`const encodedMatch = url.match(/\/pic\/(.+)/);`
			`if (encodedMatch) {`
			`url = decodeURIComponent(encodedMatch[1]);`
			`}`
			`profile.avatarUrl = url;`
			`}`

			`return profile;`
			`}`

			`/**`
			`* HTML에서 트윗 목록 파싱`
			`*/`
			`export function parseTweets(html, username) {`
			`const tweets = [];`
			`const containers = html.split('class="timeline-item ');`

			`for (let i = 1; i < containers.length; i++) {`
			`const container = containers[i];`

			`// 고정/리트윗 제외`
			`const isPinned = container.includes('class="pinned"');`
			`const isRetweet = container.includes('class="retweet-header"');`
			`if (isPinned \|\| isRetweet) continue;`

			`// 트윗 ID`
			`const idMatch = container.match(/href="\/[^\/]+\/status\/(\d+)/);`
			`if (!idMatch) continue;`
			`const id = idMatch[1];`

			`// 시간`
			`const timeMatch = container.match(/<span class="tweet-date"[^>]><a[^>]title="([^"]+)"/);`
			`const time = timeMatch ? parseNitterDateTime(timeMatch[1]) : null;`
			`if (!time) continue;`

			`// 텍스트`
			`const contentMatch = container.match(/<div class="tweet-content[^"]"[^>]>([\s\S]*?)<\/div>/);`
			`let text = '';`
			`if (contentMatch) {`
			`text = contentMatch[1]`
			`.replace(/<br\s*\/?>/g, '\n')`
			`.replace(/<a[^>]>([^<])<\/a>/g, '$1')`
			`.replace(/<[^>]+>/g, '')`
			`.trim();`
			`}`

			`// 이미지`
			`const imageUrls = extractImageUrls(container);`

			`tweets.push({`
			`id,`
			`time,`
			`text,`
			`imageUrls,`
			url: `https://x.com/${username}/status/${id}`,
			`});`
			`}`

			`return tweets;`
			`}`

			`/**`
			`* Nitter에서 트윗 수집 (첫 페이지만)`
			`*/`
			`export async function fetchTweets(nitterUrl, username) {`
			const url = `${nitterUrl}/${username}`;
			`const res = await fetch(url);`
			`const html = await res.text();`

			`// 프로필 정보`
			`const profile = extractProfile(html);`

			`// 트윗 파싱`
			`const tweets = parseTweets(html, username);`

			`return { tweets, profile };`
			`}`

			`/**`
			`* Nitter에서 전체 트윗 수집 (페이지네이션)`
			`*/`
			`export async function fetchAllTweets(nitterUrl, username, log) {`
			`const allTweets = [];`
			`let cursor = null;`
			`let pageNum = 1;`
			`let emptyCount = 0;`

			`while (true) {`
			`const url = cursor`
			? `${nitterUrl}/${username}?cursor=${cursor}`
			: `${nitterUrl}/${username}`;

			log?.info(`[페이지 ${pageNum}] 스크래핑 중...`);

			`try {`
			`const res = await fetch(url);`
			`const html = await res.text();`
			`const tweets = parseTweets(html, username);`

			`if (tweets.length === 0) {`
			`emptyCount++;`
			`if (emptyCount >= 3) break;`
			`} else {`
			`emptyCount = 0;`
			`allTweets.push(...tweets);`
			log?.info(` -> ${tweets.length}개 추출 (누적: ${allTweets.length})`);
			`}`

			`// 다음 페이지 cursor`
			`const cursorMatch = html.match(/class="show-more"[^>]>\s<a href="\?cursor=([^"]+)"/);`
			`if (!cursorMatch) break;`

			`cursor = cursorMatch[1];`
			`pageNum++;`

			`await new Promise(r => setTimeout(r, 1000));`
			`} catch (err) {`
			log?.error(` -> 오류: ${err.message}`);
			`emptyCount++;`
			`if (emptyCount >= 5) break;`
			`await new Promise(r => setTimeout(r, 3000));`
			`}`
			`}`

			`return allTweets;`
			`}`