Add: 核心采集脚本、配置、飞书推送模块

2026-02-23 00:16:08 +08:00 · 2026-02-23 00:16:08 +08:00 · a1591b12f2
commit a1591b12f2
parent 8564190abc
7 changed files with 484 additions and 2 deletions
--- a/cache/seen_urls.json
+++ b/cache/seen_urls.json
@ -1,4 +1,4 @@
 {
-  "lastUpdate": "",
+  "lastUpdate": "2026-02-22T16:15:49.916Z",
  "urls": []
-}
+}
--- a/config.json
+++ b/config.json
@ -0,0 +1,38 @@
 {
  "topics": [
    {
      "name": "AI 编程工具 / Code Agent",
      "keywords": ["copilot", "cursor", "code agent", "code generation", "code assistant", "ai coding", "llm programming", "autocomplete", "code completion"]
    },
    {
      "name": "Agent 框架",
      "keywords": ["langchain", "llamaindex", "crewai", "autogen", "agent framework", "multi-agent", "tool use", "function calling", "agent workflow"]
    },
    {
      "name": "AI 基础设施 / 推理优化",
      "keywords": ["inference", "optimization", "vllm", "tensorrt", "onnx", "quantization", "distributed training", "model serving", "gpu optimization", "cuda", "triton", "speculative decoding"]
    }
  ],
  "sources": {
    "arxiv": {
      "enabled": true,
      "categories": ["cs.AI", "cs.CL", "cs.LG"],
      "weight": 6,
      "keywordMatchWeight": 10,
      "maxResults": 50
    },
    "huggingface": {
      "enabled": true,
      "weight": 8,
      "maxResults": 30
    },
    "github": {
      "enabled": true,
      "weight": 7,
      "maxResults": 30
    }
  },
  "output": {
    "topCount": 10
  }
 }
--- a/config.yaml
+++ b/config.yaml
@ -0,0 +1,98 @@
 # AINewsCollector 配置文件
 # 话题关键词配置
 topics:
  - name: AI 编程工具 / Code Agent
    keywords:
      - copilot
      - cursor
      - code agent
      - code generation
      - code assistant
      - ai coding
      - llm programming
      - autocomplete
      - code completion
  - name: Agent 框架
    keywords:
      - langchain
      - llamaindex
      - crewai
      - autogen
      - agent framework
      - multi-agent
      - tool use
      - function calling
      - agent workflow
  - name: AI 基础设施 / 推理优化
    keywords:
      - inference
      - optimization
      - vllm
      - tensorrt
      - onnx
      - quantization
      - distributed training
      - model serving
      - gpu optimization
      - cuda
      - triton
      - speculative decoding
 # 来源配置
 sources:
  arxiv:
    enabled: true
    categories:
      - cs.AI
      - cs.CL
      - cs.LG
    weight: 6
    keywordMatchWeight: 10
    maxResults: 50
  huggingface:
    enabled: true
    weight: 8
    maxResults: 30
  github:
    enabled: true
    weight: 7
    maxResults: 30
  blogs:
    enabled: true
    sources:
      - name: Anthropic
        url: https://www.anthropic.com/news
        weight: 9
      - name: OpenAI
        url: https://openai.com/blog
        weight: 9
      - name: DeepMind
        url: https://deepmind.google/discover/blog
        weight: 8
 # 去重配置
 dedup:
  cacheFile: cache/seen_urls.json
  retentionDays: 30
 # 输出配置
 output:
  dailyDir: daily
  topCount: 10
 # Gitea 配置
 gitea:
  url: https://gitea.cynthia.life
  repo: chen/AINewsCollector
  tokenFile: ../credentials/gitea.md
 # 飞书推送配置
 feishu:
  enabled: true
  pushTime: "09:00"
--- a/daily/2026-02-23.md
+++ b/daily/2026-02-23.md
@ -0,0 +1,11 @@
 # AI Daily Brief - 2026-02-23
 > 采集时间: 2026/2/23 00:15:49
 > 总条目: 0
 ## 🔥 Top 10 重要消息
 ## 📂 分类汇总
 ---
 *Generated by AINewsCollector*
--- a/skill/ai-news-collector/SKILL.md
+++ b/skill/ai-news-collector/SKILL.md
@ -0,0 +1,35 @@
 # AINewsCollector Skill
 AI 圈最新消息自动收集与整理。
 ## 功能
 - 从 arXiv、Hugging Face Papers、GitHub Trending 采集 AI 相关信息
 - 基于话题关键词筛选和排序
 - 生成每日简报 (.md)
 - 提交到 Gitea 仓库
 - 推送至飞书
 ## 话题
 - AI 编程工具 / Code Agent
 - Agent 框架
 - AI 基础设施 / 推理优化
 ## 使用
 ### 手动触发
 ```
 采集 AI 新闻
 ```
 ### 定时任务
 - 23:00 收集任务
 - 09:00 推送任务
 ## 文件
 - `collect.js` - 核心采集脚本
 - `push.js` - 飞书推送脚本（读取 `daily/` 目录下的昨日简报并返回其内容）
--- a/skill/ai-news-collector/collect.js
+++ b/skill/ai-news-collector/collect.js
@ -0,0 +1,251 @@
 #!/usr/bin/env node
 /**
 * AINewsCollector - AI 新闻采集核心脚本
 */
 const fs = require('fs');
 const path = require('path');
 const https = require('https');
 const http = require('http');
 // 配置
 const CONFIG_PATH = path.join(__dirname, '../../config.json');
 const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json');
 const DAILY_DIR = path.join(__dirname, '../../daily');
 // HTTP 请求封装
 /**
  * Issue an HTTP(S) GET request and buffer the whole response body as text.
  *
  * The promise resolves for any final (non-redirect) status code, leaving it
  * to the caller to inspect `status`. It rejects on transport errors, on
  * timeout, or when the redirect budget is exhausted.
  *
  * @param {string} url - Absolute `http:` or `https:` URL to request.
  * @param {number} [maxRedirects=5] - Number of 3xx hops that may still be followed.
  * @returns {Promise<{data: string, status: number}>} Decoded body and final status code.
  */
 function fetch(url, maxRedirects = 5) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;
    const req = client.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; AINewsCollector/1.0)',
        'Accept': 'application/json, text/html, */*'
      },
      timeout: 30000
    }, (res) => {
      const status = res.statusCode;
      const location = res.headers.location;
      // Follow redirects instead of handing the caller an empty 3xx body.
      if (status >= 300 && status < 400 && location) {
        res.resume(); // drain the redirect body so the socket is released
        if (maxRedirects <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        try {
          // Location may be relative; resolve it against the current URL.
          resolve(fetch(new URL(location, url).toString(), maxRedirects - 1));
        } catch (err) {
          reject(err);
        }
        return;
      }
      // Decode through the stream's UTF-8 decoder so a multi-byte character
      // split across two chunks is not turned into replacement characters.
      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk) => { data += chunk; });
      res.on('end', () => resolve({ data, status }));
      res.on('error', reject);
    });
    req.on('error', reject);
    req.on('timeout', () => { req.destroy(); reject(new Error('Timeout')); });
  });
 }
 // ============ 数据采集器 ============
 /**
  * Collect the most recently updated arXiv papers for each configured category.
  *
  * Each category is queried independently; a failure for one category is
  * logged and does not prevent the remaining categories from being collected.
  *
  * @param {{categories?: string[], maxResults?: number, weight?: number}} config
  *   arXiv source settings (defaults: cs.AI/cs.CL/cs.LG, 50 results, weight 6).
  * @returns {Promise<Array<{title: string, summary: string, url: string, source: string, sourceWeight: number, date: string}>>}
  *   One item per Atom `<entry>` that has both a title and an id.
  */
 async function collectArxiv(config) {
  const categories = config.categories || ['cs.AI', 'cs.CL', 'cs.LG'];
  const maxResults = config.maxResults || 50;
  const today = new Date().toISOString().split('T')[0];
  // Atom text nodes are hard-wrapped and XML-escaped: collapse whitespace and
  // decode the predefined entities (`&amp;` last to avoid double decoding).
  const cleanText = (raw) => (raw ?? '')
    .replace(/\s+/g, ' ')
    .trim()
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&amp;/g, '&');
  const items = [];
  for (const cat of categories) {
    try {
      // The arXiv API parameter is `max_results`; `maxResults` is ignored by the server.
      const url = `https://export.arxiv.org/api/query?search_query=cat:${encodeURIComponent(cat)}&sortBy=lastUpdatedDate&sortOrder=descending&max_results=${maxResults}`;
      const { data, status } = await fetch(url);
      if (status !== 200) throw new Error(`HTTP ${status}`);
      // Everything before the first <entry> is feed-level metadata.
      const entries = data.split('<entry>').slice(1);
      for (const entry of entries) {
        const title = cleanText(entry.match(/<title>([\s\S]*?)<\/title>/)?.[1]);
        const summary = cleanText(entry.match(/<summary>([\s\S]*?)<\/summary>/)?.[1]).slice(0, 200);
        const link = entry.match(/<id>([\s\S]*?)<\/id>/)?.[1]?.trim();
        if (title && link) {
          items.push({
            title,
            summary,
            url: link,
            source: 'arXiv',
            sourceWeight: config.weight || 6,
            date: today
          });
        }
      }
    } catch (err) {
      console.error(`[arXiv] Error:`, err.message);
    }
  }
  return items;
 }
 /**
  * Collect the Hugging Face "daily papers" list.
  *
  * Any failure (network, non-200 status, malformed JSON, unexpected shape) is
  * logged and results in an empty list rather than an exception.
  *
  * @param {{maxResults?: number, weight?: number}} config
  *   Hugging Face source settings (defaults: 30 results, weight 8).
  * @returns {Promise<Array<{title: string, summary: string, url: string, source: string, sourceWeight: number, date: string}>>}
  *   One item per returned paper that carries an id.
  */
 async function collectHuggingFace(config) {
  const items = [];
  // Abstracts may span several lines; keep them on one line for the report.
  const oneLine = (text) => String(text || '').replace(/\s+/g, ' ').trim();
  try {
    // The Hub endpoint is spelled with an underscore: /api/daily_papers.
    const { data, status } = await fetch(`https://huggingface.co/api/daily_papers?limit=${config.maxResults || 30}`);
    if (status !== 200) throw new Error(`HTTP ${status}`);
    const papers = JSON.parse(data);
    if (!Array.isArray(papers)) throw new Error('Unexpected response shape');
    const today = new Date().toISOString().split('T')[0];
    for (const entry of papers) {
      const id = entry?.paper?.id || entry?.id;
      // Without an id there is no paper page to link to.
      if (!id) continue;
      items.push({
        title: oneLine(entry.title || entry.paper?.title) || 'Untitled',
        summary: oneLine(entry.summary || entry.paper?.summary).slice(0, 200),
        url: `https://huggingface.co/papers/${id}`,
        source: 'Hugging Face',
        sourceWeight: config.weight || 8,
        date: today
      });
    }
  } catch (err) {
    console.error('[HuggingFace] Error:', err.message);
  }
  return items;
 }
 /**
  * Scrape the GitHub "Trending (daily)" page into a list of repositories.
  *
  * Any failure is logged and results in an empty list rather than an exception.
  *
  * @param {{maxResults?: number, weight?: number}} config
  *   GitHub source settings (defaults: 30 results, weight 7).
  * @returns {Promise<Array<{title: string, summary: string, url: string, source: string, sourceWeight: number, stars: number, date: string}>>}
  *   One item per trending repository whose `owner/repo` path could be extracted.
  */
 async function collectGitHub(config) {
  const items = [];
  // Strip markup, decode the basic HTML entities and collapse whitespace.
  const toPlainText = (html) => (html ?? '')
    .replace(/<[^>]+>/g, '')
    .replace(/\s+/g, ' ')
    .trim()
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&amp;/g, '&');
  try {
    const { data, status } = await fetch('https://github.com/trending?since=daily');
    if (status !== 200) throw new Error(`HTTP ${status}`);
    const repoMatches = data.match(/<article[^>]*class="[^"]*\bBox-row\b[^"]*"[^>]*>[\s\S]*?<\/article>/g) || [];
    const today = new Date().toISOString().split('T')[0];
    for (const repo of repoMatches.slice(0, config.maxResults || 30)) {
      // The first link in an article can be the star/login button, so read the
      // repository path from the <h2> heading; fall back to the whole article
      // only when no heading is present.
      const heading = repo.match(/<h2[^>]*>[\s\S]*?<\/h2>/)?.[0] ?? repo;
      const href = heading.match(/href="\/([\w.-]+\/[\w.-]+)"/)?.[1];
      const desc = toPlainText(repo.match(/<p class="col-9[^"]*">([\s\S]*?)<\/p>/)?.[1]);
      const stars = repo.match(/([\d,]+)\s*stars today/)?.[1] || '0';
      if (href) {
        items.push({
          title: href,
          summary: desc.slice(0, 200),
          url: `https://github.com/${href}`,
          source: 'GitHub Trending',
          sourceWeight: config.weight || 7,
          stars: Number.parseInt(stars.replace(/,/g, ''), 10) || 0,
          date: today
        });
      }
    }
  } catch (err) {
    console.error('[GitHub] Error:', err.message);
  }
  return items;
 }
 // ============ 处理模块 ============
 /**
  * Score an item by how many topic keywords occur in its title or summary.
  *
  * Matching is a case-insensitive substring test; every keyword that matches
  * contributes 2 points, and a keyword listed under several topics counts
  * once per listing.
  *
  * @param {{title: string, summary: string}} item - Item to score.
  * @param {Array<{keywords: string[]}>} topics - Topics with their keyword lists.
  * @returns {number} Twice the number of matching keywords.
  */
 function calculateKeywordScore(item, topics) {
  const haystack = `${item.title} ${item.summary}`.toLowerCase();
  let hits = 0;
  for (const { keywords } of topics) {
    for (const candidate of keywords) {
      const needle = candidate.toLowerCase();
      if (haystack.includes(needle)) {
        hits += 1;
      }
    }
  }
  return hits * 2;
 }
 /**
  * Drop items whose URL has already been seen, recording new URLs as it goes.
  *
  * URLs are compared lower-cased and without a single trailing slash.
  * `seenUrls` is mutated: the key of every kept item is added to it.
  *
  * @param {Array<{url: string}>} items - Candidate items, in priority order.
  * @param {Set<string>} seenUrls - Normalised URLs seen so far (updated in place).
  * @returns {Array<{url: string}>} The items whose URL was not seen before.
  */
 function deduplicate(items, seenUrls) {
  const fresh = [];
  for (const candidate of items) {
    const key = candidate.url.toLowerCase().replace(/\/$/, '');
    if (!seenUrls.has(key)) {
      seenUrls.add(key);
      fresh.push(candidate);
    }
  }
  return fresh;
 }
 /**
  * Attach a `finalScore` to every item and order the items by it, highest first.
  *
  * The score is the source weight plus the keyword score plus a flat bonus of
  * 2 for items with more than 100 stars. The input array and its items are
  * left untouched; scored copies are returned.
  *
  * @param {Array<object>} items - Collected items.
  * @param {Array<{keywords: string[]}>} topics - Topics used for keyword scoring.
  * @returns {Array<object>} Scored copies of the items in descending score order.
  */
 function sortItems(items, topics) {
  const scored = items.map((entry) => {
    const starBonus = entry.stars > 100 ? 2 : 0;
    const finalScore = entry.sourceWeight + calculateKeywordScore(entry, topics) + starBonus;
    return { ...entry, finalScore };
  });
  scored.sort((left, right) => right.finalScore - left.finalScore);
  return scored;
 }
 // ============ 输出模块 ============
 /**
  * Render the daily brief as Markdown.
  *
  * The report has a "Top N" section with the first `topCount` items (with a
  * summary excerpt of up to 150 characters) followed by one section per topic
  * listing up to 10 keyword-matching items that are not already in the top list.
  *
  * @param {Array<{title: string, summary: string, url: string, source: string}>} items
  *   Items already sorted by importance.
  * @param {number} topCount - Number of items shown in the top section.
  * @param {Array<{name: string, keywords: string[]}>} topics - Topics for the per-topic sections.
  * @returns {{md: string, date: string}} The Markdown text and the report date
  *   (Asia/Shanghai) that the caller uses as the file name.
  */
 function generateMarkdown(items, topCount, topics) {
  // Titles and summaries come from scraped sources; an embedded newline would
  // end the list item or blockquote early, so force them onto a single line.
  const oneLine = (text) => String(text ?? '').replace(/\s+/g, ' ').trim();
  const date = new Date().toLocaleDateString('zh-CN', {
    year: 'numeric', month: '2-digit', day: '2-digit', timeZone: 'Asia/Shanghai'
  }).replace(/\//g, '-');
  const topItems = items.slice(0, topCount);
  const topSet = new Set(topItems);
  let md = `# AI Daily Brief - ${date}\n\n`;
  md += `> 采集时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}\n`;
  md += `> 总条目: ${items.length}\n\n`;
  md += `## 🔥 Top ${topCount} 重要消息\n\n`;
  topItems.forEach((item, index) => {
    const summary = oneLine(item.summary);
    md += `${index + 1}. [${oneLine(item.title)}](${item.url}) - **${item.source}**\n`;
    if (summary) md += `   > ${summary.slice(0, 150)}${summary.length > 150 ? '...' : ''}\n`;
    md += '\n';
  });
  md += `## 📂 分类汇总\n\n`;
  for (const topic of topics) {
    const topicItems = items.filter((item) => {
      // Items already shown in the top section are not repeated per topic.
      if (topSet.has(item)) return false;
      const text = `${item.title} ${item.summary}`.toLowerCase();
      return topic.keywords.some((k) => text.includes(k.toLowerCase()));
    });
    if (topicItems.length > 0) {
      md += `### ${topic.name}\n\n`;
      for (const item of topicItems.slice(0, 10)) {
        md += `- [${oneLine(item.title)}](${item.url}) - ${item.source}\n`;
      }
      md += '\n';
    }
  }
  md += `---\n*Generated by AINewsCollector*\n`;
  return { md, date };
 }
 // ============ 主流程 ============
 /**
  * Run one collection cycle: load config and the seen-URL cache, query every
  * enabled source, drop already-seen URLs, rank the rest, write the daily
  * Markdown brief and persist the updated cache.
  *
  * @returns {Promise<{success: boolean, outputPath: string, date: string, itemCount: number, content: string}>}
  *   Summary of the run, including the rendered report.
  * @throws {Error} If the config file cannot be read/parsed or an output file cannot be written.
  */
 async function main() {
  console.log('🚀 AINewsCollector 开始运行...\n');
  const config = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
  let seenUrls = new Set();
  if (fs.existsSync(CACHE_PATH)) {
    try {
      const cache = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
      seenUrls = new Set(cache.urls || []);
    } catch (err) {
      // A damaged cache only costs deduplication; it should not abort the run.
      console.error('[cache] Ignoring unreadable cache:', err.message);
    }
  }
  const collectors = [
    { label: 'arXiv', source: config.sources?.arxiv, collect: collectArxiv },
    { label: 'Hugging Face Papers', source: config.sources?.huggingface, collect: collectHuggingFace },
    { label: 'GitHub Trending', source: config.sources?.github, collect: collectGitHub }
  ];
  let allItems = [];
  for (const { label, source, collect } of collectors) {
    // Honour the per-source `enabled` switch; a source with no explicit flag
    // stays enabled so existing configs behave as before.
    if (!source || source.enabled === false) {
      console.log(`⏭️  跳过 ${label} (已禁用)\n`);
      continue;
    }
    console.log(`📡 采集 ${label}...`);
    const found = await collect(source);
    allItems.push(...found);
    console.log(`   获取 ${found.length} 条\n`);
  }
  const originalCount = allItems.length;
  allItems = deduplicate(allItems, seenUrls);
  console.log(`🔄 去重: ${originalCount} → ${allItems.length}\n`);
  allItems = sortItems(allItems, config.topics);
  const { md, date } = generateMarkdown(allItems, config.output?.topCount ?? 10, config.topics);
  if (!fs.existsSync(DAILY_DIR)) fs.mkdirSync(DAILY_DIR, { recursive: true });
  // NOTE(review): a second run on the same date rewrites this file with only
  // the items that the first run had not already recorded in the cache.
  const outputPath = path.join(DAILY_DIR, `${date}.md`);
  fs.writeFileSync(outputPath, md);
  console.log(`📝 简报已保存: ${outputPath}\n`);
  // Keep only the 5000 most recently added URLs so the cache stays bounded.
  const cacheData = { lastUpdate: new Date().toISOString(), urls: Array.from(seenUrls).slice(-5000) };
  fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
  fs.writeFileSync(CACHE_PATH, JSON.stringify(cacheData, null, 2));
  console.log('✅ 采集完成!\n');
  return { success: true, outputPath, date, itemCount: allItems.length, content: md };
 }
 // Run the collector only when this file is executed directly; when it is
 // required as a module, just expose `main`. A failed run exits with code 1.
 if (require.main === module) {
  main().catch((err) => {
    console.error('❌ Error:', err);
    process.exit(1);
  });
 }
 module.exports = { main };
--- a/skill/ai-news-collector/push.js
+++ b/skill/ai-news-collector/push.js
@ -0,0 +1,49 @@
 #!/usr/bin/env node
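 /**
 * Feishu push helper.
 * Loads yesterday's daily brief from the daily/ directory and returns its
 * content; this file itself contains no Feishu API call.
 */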
 const fs = require('fs');
 const path = require('path');
 const DAILY_DIR = path.join(__dirname, '../../daily');
 /**
  * Return yesterday's calendar date in the Asia/Shanghai time zone as `YYYY-MM-DD`.
  *
  * Exactly 24 hours are subtracted from `now` before formatting, so the
  * result does not depend on the host's local time zone or on any
  * daylight-saving change there.
  *
  * @param {Date} [now=new Date()] - Reference instant; defaults to the current time.
  * @returns {string} The date string, e.g. `2026-02-22`.
  */
 function getYesterdayDate(now = new Date()) {
  const DAY_MS = 24 * 60 * 60 * 1000;
  const yesterday = new Date(now.getTime() - DAY_MS);
  return yesterday.toLocaleDateString('zh-CN', {
    year: 'numeric',
    month: '2-digit',
    day: '2-digit',
    timeZone: 'Asia/Shanghai'
  }).replace(/\//g, '-');
 }
 /**
  * Load yesterday's daily brief from disk, print a 500-character preview and
  * return the full text.
  *
  * @returns {Promise<{success: true, date: string, content: string} | {success: false, error: string}>}
  *   The report and its date, or a failure record when the file does not exist.
  */
 async function main() {
  const yesterday = getYesterdayDate();
  const reportPath = path.join(DAILY_DIR, `${yesterday}.md`);
  const reportExists = fs.existsSync(reportPath);
  if (reportExists) {
    const content = fs.readFileSync(reportPath, 'utf8');
    const preview = `${content.slice(0, 500)}...\n`;
    console.log(`📋 昨日简报 (${yesterday}):\n`);
    console.log(preview);
    return { success: true, date: yesterday, content };
  }
  console.log(`❌ 未找到昨日简报: ${reportPath}`);
  return { success: false, error: 'Report not found' };
 }
 // Run only when executed directly; when required as a module, just expose
 // `main`. An unexpected failure is logged and reported through a non-zero
 // exit code so that a scheduler can detect it.
 if (require.main === module) {
  main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
 }
 module.exports = { main };