AINewsCollector/skill/ai-news-collector/collect.js

#!/usr/bin/env node

/**
 * AINewsCollector - AI 新闻采集核心脚本
 */

const fs = require('fs');
const path = require('path');
const { execFileSync, execSync } = require('child_process');

// Locations of the config file, the seen-URL cache and the daily output
// directory, all resolved relative to this script's own directory.
const CONFIG_PATH = path.join(__dirname, '../../config.json');
const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json');
const DAILY_DIR = path.join(__dirname, '../../daily');

// Proxy used for outbound requests. The first non-empty proxy environment
// variable wins; uppercase names keep their original precedence and the
// lowercase spellings (the form curl and many other tools read) are consulted
// next. When none is set, a local proxy on port 7890 is assumed.
const PROXY_URL =
  process.env.HTTP_PROXY ||
  process.env.HTTPS_PROXY ||
  process.env.http_proxy ||
  process.env.https_proxy ||
  'http://127.0.0.1:7890';

/**
 * Fetch a URL by spawning `curl`, routed through PROXY_URL when it is non-empty.
 *
 * The call itself is synchronous; callers may still `await` the returned object.
 *
 * @param {string} url - URL to request (redirects are followed).
 * @returns {{ data: string, status: number }} Response body decoded as UTF-8.
 *   `status` is always 200: curl runs with `--fail`, so an HTTP status >= 400
 *   makes curl exit non-zero and this function throw instead of returning.
 * @throws {Error} "Fetch failed: …" (original error attached as `cause`) when
 *   curl exits non-zero, e.g. network error, the 30 s timeout, an HTTP error
 *   status, or output exceeding the 10 MiB buffer.
 */
function fetch(url) {
  // Arguments are passed as an array and no shell is involved, so quotes, `$`
  // or backticks inside the URL or proxy value cannot be interpreted as shell syntax.
  const args = ['-s', '--fail'];
  if (PROXY_URL) args.push('--proxy', PROXY_URL);
  args.push(
    '-L',
    '--max-time', '30',
    '-H', 'User-Agent: Mozilla/5.0 (compatible; AINewsCollector/1.0)',
    String(url)
  );

  try {
    const data = execFileSync('curl', args, { encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 });
    return { data, status: 200 };
  } catch (err) {
    throw new Error(`Fetch failed: ${err.message}`, { cause: err });
  }
}

// ============ 数据采集器 ============

/**
 * Collect arXiv papers published within the last 48 hours.
 *
 * Queries the arXiv export API once per category, splits the Atom response on
 * `<entry>` and keeps entries that have a title, an id and a `<published>`
 * timestamp no older than 48 hours. A failure for one category is logged and
 * does not stop the remaining categories.
 *
 * @param {object} [config] - arXiv source settings.
 * @param {string[]} [config.categories] - Categories to query (default cs.AI, cs.CL, cs.LG).
 * @param {number} [config.maxResults] - Entries requested per category (default 50).
 * @param {number} [config.weight] - Source weight copied onto every item (default 6).
 * @returns {Promise<Array<{title: string, summary: string, url: string, source: string, sourceWeight: number, date: string}>>}
 *   Collected items; `summary` is capped at 200 characters and `date` is the
 *   YYYY-MM-DD part of the publication timestamp.
 */
async function collectArxiv(config = {}) {
  const items = [];
  const categories = config.categories || ['cs.AI', 'cs.CL', 'cs.LG'];
  const cutoff = new Date(Date.now() - 48 * 60 * 60 * 1000);

  // Text of the first <tag>…</tag> in an entry, with every run of whitespace
  // collapsed to one space so a value wrapped over several indented lines
  // becomes a single clean line. Returns undefined when the tag is absent.
  const readTag = (entry, tag) =>
    entry.match(new RegExp(`<${tag}>([\\s\\S]*?)</${tag}>`))?.[1]?.replace(/\s+/g, ' ').trim();

  for (const cat of categories) {
    try {
      // The API is asked for the most recently updated papers; the 48-hour
      // filter on the publication date is applied locally below.
      const url = `https://export.arxiv.org/api/query?search_query=cat:${cat}&sortBy=lastUpdatedDate&sortOrder=descending&max_results=${config.maxResults || 50}`;

      console.log(`   查询: ${cat}`);
      const { data } = await fetch(url);

      // Everything before the first <entry> is feed-level metadata
      for (const entry of data.split('<entry>').slice(1)) {
        const title = readTag(entry, 'title');
        const link = readTag(entry, 'id');
        const published = readTag(entry, 'published');
        if (!title || !link || !published) continue;

        // An unparsable timestamp yields an Invalid Date, which never
        // satisfies `>=` and is therefore skipped as well.
        if (!(new Date(published) >= cutoff)) continue;

        items.push({
          title,
          summary: (readTag(entry, 'summary') || '').slice(0, 200),
          url: link,
          source: 'arXiv',
          sourceWeight: config.weight || 6,
          date: published.split('T')[0]
        });
      }
    } catch (err) {
      console.error(`[arXiv] Error for ${cat}:`, err.message);
    }
  }
  return items;
}

/**
 * Collect the Hugging Face "daily papers" list.
 *
 * Each entry's title, summary and id are read from the entry itself or from its
 * nested `paper` object. Entries without any id are skipped because no valid
 * paper link can be built for them. Any fetch or parse failure is logged and
 * whatever was collected so far (possibly nothing) is returned.
 *
 * @param {object} [config] - Hugging Face source settings.
 * @param {number} [config.maxResults] - Value sent as the `limit` query parameter (default 30).
 * @param {number} [config.weight] - Source weight copied onto every item (default 8).
 * @returns {Promise<Array<{title: string, summary: string, url: string, source: string, sourceWeight: number, date: string}>>}
 *   Collected items; `summary` is whitespace-normalised and capped at 200
 *   characters, `date` is today's date (UTC) as YYYY-MM-DD.
 */
async function collectHuggingFace(config = {}) {
  const items = [];
  try {
    // Endpoint name uses an underscore (`daily_papers`); the request carries no credentials
    const { data } = await fetch(`https://huggingface.co/api/daily_papers?limit=${config.maxResults || 30}`);
    const papers = JSON.parse(data);

    if (!Array.isArray(papers)) {
      console.error('[HuggingFace] Error: API返回的不是数组');
      return items;
    }

    const today = new Date().toISOString().split('T')[0];
    for (const entry of papers) {
      const id = entry?.paper?.id || entry?.id;
      // Without an id the link would end in "/papers/undefined"
      if (!id) continue;

      // Same lookup order as the title: entry first, then the nested paper
      const summary = entry.summary || entry.paper?.summary || '';
      items.push({
        title: entry.title || entry.paper?.title || 'Untitled',
        // Collapse line breaks so the summary stays on one line downstream
        summary: String(summary).replace(/\s+/g, ' ').trim().slice(0, 200),
        url: `https://huggingface.co/papers/${id}`,
        source: 'Hugging Face',
        sourceWeight: config.weight || 8,
        date: today
      });
    }
  } catch (err) {
    console.error('[HuggingFace] Error:', err.message);
  }
  return items;
}

/**
 * Collect repositories from the GitHub daily trending page by scraping its HTML.
 *
 * Every `<article class="Box-row">` card yields one item: the repository path,
 * its description with markup removed, and the "stars today" counter. Any
 * failure is logged and whatever was collected so far is returned.
 *
 * @param {object} [config] - GitHub source settings.
 * @param {number} [config.maxResults] - Maximum number of cards to read (default 30).
 * @param {number} [config.weight] - Source weight copied onto every item (default 7).
 * @returns {Promise<Array<{title: string, summary: string, url: string, source: string, sourceWeight: number, stars: number, date: string}>>}
 *   Collected items; `title` is "owner/repo", `summary` is capped at 200
 *   characters and `date` is today's date (UTC) as YYYY-MM-DD.
 */
async function collectGitHub(config = {}) {
  const items = [];
  try {
    const { data } = await fetch('https://github.com/trending?since=daily');
    const cards = data.match(/<article class="Box-row">[\s\S]*?<\/article>/g) || [];
    const today = new Date().toISOString().split('T')[0];

    for (const card of cards.slice(0, config.maxResults || 30)) {
      // Prefer the link inside the card's <h2> heading: the first href in a
      // card is not guaranteed to be the repository (it can be an action link
      // such as a login/star button). Fall back to the first href otherwise.
      const href =
        card.match(/<h2[^>]*>(?:(?!<\/h2>)[\s\S])*?href="\/([^"]+)"/)?.[1] ??
        card.match(/href="\/([^"]+)"/)?.[1];
      if (!href) continue;

      // Strip nested tags first, then tidy the whitespace they leave behind
      const desc = (card.match(/<p class="col-9[^"]*">([\s\S]*?)<\/p>/)?.[1] || '')
        .replace(/<[^>]+>/g, '')
        .replace(/\s+/g, ' ')
        .trim();
      const stars = card.match(/([\d,]+)\s*stars today/)?.[1] || '0';

      items.push({
        title: href,
        summary: desc.slice(0, 200),
        url: `https://github.com/${href}`,
        source: 'GitHub Trending',
        sourceWeight: config.weight || 7,
        // Counter is rendered with thousands separators, e.g. "1,234"
        stars: Number.parseInt(stars.replace(/,/g, ''), 10) || 0,
        date: today
      });
    }
  } catch (err) {
    console.error('[GitHub] Error:', err.message);
  }
  return items;
}

// ============ 处理模块 ============

/**
 * Score an item by how many topic keywords occur in its title and summary.
 *
 * Matching is a case-insensitive substring test. Every keyword of every topic
 * that matches contributes 2 points, so a keyword listed under two topics is
 * counted twice.
 *
 * @param {{title: string, summary: string}} item - Item to score.
 * @param {Array<{keywords: string[]}>} topics - Topics whose keywords are checked.
 * @returns {number} Two points per matching keyword occurrence in `topics`.
 */
function calculateKeywordScore(item, topics) {
  const haystack = `${item.title} ${item.summary}`.toLowerCase();
  const pointsPerHit = 2;

  let hits = 0;
  for (const { keywords } of topics) {
    hits += [...keywords].filter((word) => haystack.includes(word.toLowerCase())).length;
  }
  return hits * pointsPerHit;
}

/**
 * Drop items whose URL has already been seen, recording new URLs as it goes.
 *
 * URLs are compared lower-cased and without a single trailing slash. The
 * `seenUrls` set is mutated: the key of every item that is kept gets added,
 * so later duplicates within the same call are removed too.
 *
 * @param {Array<{url: string}>} items - Candidate items, in priority order.
 * @param {Set<string>} seenUrls - Normalised URLs seen so far (updated in place).
 * @returns {Array<{url: string}>} The items whose URL was not seen before, in input order.
 */
function deduplicate(items, seenUrls) {
  const fresh = [];
  for (const item of items) {
    const key = item.url.toLowerCase().replace(/\/$/, '');
    if (!seenUrls.has(key)) {
      seenUrls.add(key);
      fresh.push(item);
    }
  }
  return fresh;
}

/**
 * Attach a `finalScore` to every item and order the items by it, highest first.
 *
 * The score is the item's source weight, plus its keyword score, plus a bonus
 * of 2 when the item has more than 100 stars. Input items are not modified;
 * scored copies are returned.
 *
 * @param {Array<object>} items - Items carrying `sourceWeight` and optionally `stars`.
 * @param {Array<{keywords: string[]}>} topics - Topics passed on to the keyword scorer.
 * @returns {Array<object>} New array of item copies with `finalScore`, sorted descending.
 */
function sortItems(items, topics) {
  const withScore = (item) => {
    const base = item.sourceWeight + calculateKeywordScore(item, topics);
    const starBonus = item.stars > 100 ? 2 : 0;
    return { ...item, finalScore: base + starBonus };
  };
  const byScoreDescending = (left, right) => right.finalScore - left.finalScore;

  return items.map(withScore).sort(byScoreDescending);
}

// ============ 输出模块 ============

/**
 * Render the daily brief as Markdown.
 *
 * The first `topCount` items form the "Top" section (with a one-line summary
 * quote of at most 150 characters). The remaining items are then listed per
 * topic, up to 10 per topic, whenever a topic keyword occurs in their title or
 * summary; topics with no matching item are omitted.
 *
 * @param {Array<{title: string, summary?: string, url: string, source: string}>} items
 *   Items in display order (highest priority first).
 * @param {number} [topCount] - Size of the "Top" section; 10 when null/undefined.
 * @param {Array<{name: string, keywords: string[]}>} [topics] - Topic definitions; none when null/undefined.
 * @returns {{ md: string, date: string }} The Markdown text and the brief's date
 *   as YYYY-MM-DD in the Asia/Shanghai time zone.
 */
function generateMarkdown(items, topCount, topics) {
  const limit = topCount ?? 10;
  const topicList = topics ?? [];
  const now = new Date();

  // Build YYYY-MM-DD from the named parts rather than from a formatted string,
  // so the result does not depend on how the runtime's locale data orders or
  // separates the date fields.
  const dateParts = new Intl.DateTimeFormat('en-US', {
    year: 'numeric', month: '2-digit', day: '2-digit', timeZone: 'Asia/Shanghai'
  }).formatToParts(now);
  const part = (type) => dateParts.find((p) => p.type === type)?.value;
  const date = `${part('year')}-${part('month')}-${part('day')}`;

  const topItems = items.slice(0, limit);
  // Set lookup instead of scanning the top list once per item and topic
  const topSet = new Set(topItems);

  let md = `# AI Daily Brief - ${date}\n\n`;
  md += `> 采集时间: ${now.toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}\n`;
  md += `> 总条目: ${items.length}\n\n`;

  md += `## 🔥 Top ${limit} 重要消息\n\n`;
  topItems.forEach((item, index) => {
    md += `${index + 1}. [${item.title}](${item.url}) - **${item.source}**\n`;
    // A line break inside the summary would end the blockquote early
    const summary = String(item.summary ?? '').replace(/\s+/g, ' ').trim();
    if (summary) md += `   > ${summary.slice(0, 150)}${summary.length > 150 ? '...' : ''}\n`;
    md += '\n';
  });

  md += `## 📂 分类汇总\n\n`;

  // Items already shown in the Top section are not repeated under a topic
  const remaining = items
    .filter((item) => !topSet.has(item))
    .map((item) => ({ item, text: `${item.title} ${item.summary}`.toLowerCase() }));

  for (const topic of topicList) {
    const keywords = (topic.keywords ?? []).map((keyword) => String(keyword).toLowerCase());
    const matches = remaining.filter(({ text }) => keywords.some((keyword) => text.includes(keyword)));
    if (matches.length === 0) continue;

    md += `### ${topic.name}\n\n`;
    for (const { item } of matches.slice(0, 10)) {
      md += `- [${item.title}](${item.url}) - ${item.source}\n`;
    }
    md += '\n';
  }

  md += `---\n*Generated by AINewsCollector*\n`;

  return { md, date };
}

// ============ 主流程 ============

/**
 * Run one full collection cycle: load config and the seen-URL cache, collect
 * from arXiv, Hugging Face and GitHub Trending, drop already-seen URLs, rank
 * the rest, write the daily Markdown brief and persist the updated cache.
 *
 * Missing `sources`, `topics` or `output` sections in the config fall back to
 * the collectors' defaults, no topics, and a Top-10 list respectively. A cache
 * file that cannot be parsed is reported and treated as empty.
 *
 * @returns {Promise<{success: boolean, outputPath: string, date: string, itemCount: number, content: string}>}
 *   Summary of the run; `content` is the generated Markdown.
 * @throws {Error} When the config file is missing or invalid, or when the
 *   brief or cache cannot be written.
 */
async function main() {
  console.log('🚀 AINewsCollector 开始运行...\n');

  const config = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
  const sources = config.sources ?? {};
  const topics = config.topics ?? [];
  const topCount = config.output?.topCount ?? 10;

  // A damaged cache only costs some repeated items, so it must not abort the run
  let seenUrls = new Set();
  if (fs.existsSync(CACHE_PATH)) {
    try {
      const cache = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
      seenUrls = new Set(Array.isArray(cache?.urls) ? cache.urls : []);
    } catch (err) {
      console.error('[Cache] Error:', err.message);
    }
  }

  let allItems = [];

  console.log('📡 采集 arXiv...');
  const arxivItems = await collectArxiv(sources.arxiv ?? {});
  allItems.push(...arxivItems);
  console.log(`   获取 ${arxivItems.length} 条\n`);

  console.log('📡 采集 Hugging Face Papers...');
  const hfItems = await collectHuggingFace(sources.huggingface ?? {});
  allItems.push(...hfItems);
  console.log(`   获取 ${hfItems.length} 条\n`);

  console.log('📡 采集 GitHub Trending...');
  const ghItems = await collectGitHub(sources.github ?? {});
  allItems.push(...ghItems);
  console.log(`   获取 ${ghItems.length} 条\n`);

  const originalCount = allItems.length;
  // Also records the kept URLs in seenUrls, which is persisted below
  allItems = deduplicate(allItems, seenUrls);
  console.log(`🔄 去重: ${originalCount} → ${allItems.length}\n`);

  allItems = sortItems(allItems, topics);

  const { md, date } = generateMarkdown(allItems, topCount, topics);

  fs.mkdirSync(DAILY_DIR, { recursive: true });
  const outputPath = path.join(DAILY_DIR, `${date}.md`);
  fs.writeFileSync(outputPath, md);
  console.log(`📝 简报已保存: ${outputPath}\n`);

  // The cache directory may not exist yet on a first run
  fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
  // Only the 5000 most recently added URLs are kept to bound the cache size
  const cacheData = { lastUpdate: new Date().toISOString(), urls: Array.from(seenUrls).slice(-5000) };
  fs.writeFileSync(CACHE_PATH, JSON.stringify(cacheData, null, 2));

  console.log('✅ 采集完成!\n');

  return { success: true, outputPath, date, itemCount: allItems.length, content: md };
}

// Run a collection cycle only when this file is the process entry point;
// when it is loaded via require(), only the export below takes effect.
if (module === require.main) {
  main().catch((error) => {
    console.error('❌ Error:', error);
    process.exit(1);
  });
}

// Public API: lets other scripts trigger a collection run programmatically
module.exports = { main };