Add: 核心采集脚本、配置、飞书推送模块

2026-02-23 00:16:08 +08:00 · 2026-02-23 00:16:08 +08:00 · a1591b12f2
commit a1591b12f2
parent 8564190abc
7 changed files with 484 additions and 2 deletions
--- a/cache/seen_urls.json
+++ b/cache/seen_urls.json
@ -1,4 +1,4 @@
 {
-  "lastUpdate": "",
+  "lastUpdate": "2026-02-22T16:15:49.916Z",
  "urls": []
-}
+}
--- a/config.json
+++ b/config.json
@ -0,0 +1,38 @@
+{
+  "topics": [
+    {
+      "name": "AI 编程工具 / Code Agent",
+      "keywords": ["copilot", "cursor", "code agent", "code generation", "code assistant", "ai coding", "llm programming", "autocomplete", "code completion"]
+    },
+    {
+      "name": "Agent 框架",
+      "keywords": ["langchain", "llamaindex", "crewai", "autogen", "agent framework", "multi-agent", "tool use", "function calling", "agent workflow"]
+    },
+    {
+      "name": "AI 基础设施 / 推理优化",
+      "keywords": ["inference", "optimization", "vllm", "tensorrt", "onnx", "quantization", "distributed training", "model serving", "gpu optimization", "cuda", "triton", "speculative decoding"]
+    }
+  ],
+  "sources": {
+    "arxiv": {
+      "enabled": true,
+      "categories": ["cs.AI", "cs.CL", "cs.LG"],
+      "weight": 6,
+      "keywordMatchWeight": 10,
+      "maxResults": 50
+    },
+    "huggingface": {
+      "enabled": true,
+      "weight": 8,
+      "maxResults": 30
+    },
+    "github": {
+      "enabled": true,
+      "weight": 7,
+      "maxResults": 30
+    }
+  },
+  "output": {
+    "topCount": 10
+  }
+}
--- a/config.yaml
+++ b/config.yaml
@ -0,0 +1,98 @@
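+# AINewsCollector 配置文件
+# 注意: 本次提交中的 collect.js 读取的是 config.json 而不是本文件;
+# 下方的 blogs / dedup / gitea / feishu 等配置项尚未被 collect.js 或 push.js 使用。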
+
+# 话题关键词配置
+topics:
+  - name: AI 编程工具 / Code Agent
+    keywords:
+      - copilot
+      - cursor
+      - code agent
+      - code generation
+      - code assistant
+      - ai coding
+      - llm programming
+      - autocomplete
+      - code completion
+
+  - name: Agent 框架
+    keywords:
+      - langchain
+      - llamaindex
+      - crewai
+      - autogen
+      - agent framework
+      - multi-agent
+      - tool use
+      - function calling
+      - agent workflow
+
+  - name: AI 基础设施 / 推理优化
+    keywords:
+      - inference
+      - optimization
+      - vllm
+      - tensorrt
+      - onnx
+      - quantization
+      - distributed training
+      - model serving
+      - gpu optimization
+      - cuda
+      - triton
+      - speculative decoding
+
+# 来源配置
+sources:
+  arxiv:
+    enabled: true
+    categories:
+      - cs.AI
+      - cs.CL
+      - cs.LG
+    weight: 6
+    keywordMatchWeight: 10
+    maxResults: 50
+
+  huggingface:
+    enabled: true
+    weight: 8
+    maxResults: 30
+
+  github:
+    enabled: true
+    weight: 7
+    maxResults: 30
+
+  blogs:
+    enabled: true
+    sources:
+      - name: Anthropic
+        url: https://www.anthropic.com/news
+        weight: 9
+      - name: OpenAI
+        url: https://openai.com/blog
+        weight: 9
+      - name: DeepMind
+        url: https://deepmind.google/discover/blog
+        weight: 8
+
+# 去重配置
+dedup:
+  cacheFile: cache/seen_urls.json
+  retentionDays: 30
+
+# 输出配置
+output:
+  dailyDir: daily
+  topCount: 10
+
+# Gitea 配置
+gitea:
+  url: https://gitea.cynthia.life
+  repo: chen/AINewsCollector
+  tokenFile: ../credentials/gitea.md
+
+# 飞书推送配置
+feishu:
+  enabled: true
+  pushTime: "09:00"
--- a/daily/2026-02-23.md
+++ b/daily/2026-02-23.md
@ -0,0 +1,11 @@
+# AI Daily Brief - 2026-02-23
+
+> 采集时间: 2026/2/23 00:15:49
+> 总条目: 0
+
+## 🔥 Top 10 重要消息
+
+## 📂 分类汇总
+
+---
+*Generated by AINewsCollector*
--- a/skill/ai-news-collector/SKILL.md
+++ b/skill/ai-news-collector/SKILL.md
@ -0,0 +1,35 @@
+# AINewsCollector Skill
+
+AI 圈最新消息自动收集与整理。
+
+## 功能
+
+- 从 arXiv、Hugging Face Papers、GitHub Trending 采集 AI 相关信息
+- 基于话题关键词筛选和排序
+- 生成每日简报 (.md)
+- 提交到 Gitea 仓库
+- 推送至飞书
+
+## 话题
+
+- AI 编程工具 / Code Agent
+- Agent 框架
+- AI 基础设施 / 推理优化
+
+## 使用
+
+### 手动触发
+
+```
+采集 AI 新闻
+```
+
+### 定时任务
+
+- 23:00 收集任务
+- 09:00 推送任务
+
+## 文件
+
+- `collect.js` - 核心采集脚本
+- `push.js` - 飞书推送脚本
--- a/skill/ai-news-collector/collect.js
+++ b/skill/ai-news-collector/collect.js
@ -0,0 +1,251 @@
+#!/usr/bin/env node
+
+/**
+ * AINewsCollector - AI 新闻采集核心脚本
+ */
+
+const fs = require('fs');
+const path = require('path');
+const https = require('https');
+const http = require('http');
+
+// 配置
+const CONFIG_PATH = path.join(__dirname, '../../config.json');
+const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json');
+const DAILY_DIR = path.join(__dirname, '../../daily');
+
+// HTTP 请求封装
+function fetch(url) {
+  return new Promise((resolve, reject) => {
+    const client = url.startsWith('https') ? https : http;
+    const req = client.get(url, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (compatible; AINewsCollector/1.0)',
+        'Accept': 'application/json, text/html, */*'
+      },
+      timeout: 30000
+    }, (res) => {
+      let data = '';
+      res.on('data', chunk => data += chunk);
+      res.on('end', () => resolve({ data, status: res.statusCode }));
+    });
+    req.on('error', reject);
+    req.on('timeout', () => { req.destroy(); reject(new Error('Timeout')); });
+  });
+}
+
+// ============ 数据采集器 ============
+
+async function collectArxiv(config) {
+  const items = [];
+  const categories = config.categories || ['cs.AI', 'cs.CL', 'cs.LG'];
+  
+  for (const cat of categories) {
+    try {
+      const url = `http://export.arxiv.org/api/query?search_query=cat:${cat}&sortBy=lastUpdatedDate&sortOrder=descending&maxResults=${config.maxResults || 50}`;
+      const { data } = await fetch(url);
+      
+      const entries = data.split('<entry>').slice(1);
+      
+      for (const entry of entries) {
+        const title = entry.match(/<title>([\s\S]*?)<\/title>/)?.[1]?.trim().replace(/\n/g, ' ');
+        const summary = entry.match(/<summary>([\s\S]*?)<\/summary>/)?.[1]?.trim().replace(/\n/g, ' ').slice(0, 200);
+        const link = entry.match(/<id>([\s\S]*?)<\/id>/)?.[1]?.trim();
+        
+        if (title && link) {
+          items.push({
+            title,
+            summary: summary || '',
+            url: link,
+            source: 'arXiv',
+            sourceWeight: config.weight || 6,
+            date: new Date().toISOString().split('T')[0]
+          });
+        }
+      }
+    } catch (err) {
+      console.error(`[arXiv] Error:`, err.message);
+    }
+  }
+  return items;
+}
+
+async function collectHuggingFace(config) {
+  const items = [];
+  try {
+    const { data } = await fetch(`https://huggingface.co/api/daily-papers?limit=${config.maxResults || 30}`);
+    const papers = JSON.parse(data);
+    
+    for (const paper of papers) {
+      items.push({
+        title: paper.title || paper.paper?.title || 'Untitled',
+        summary: (paper.summary || '').slice(0, 200),
+        url: `https://huggingface.co/papers/${paper.paper?.id || paper.id}`,
+        source: 'Hugging Face',
+        sourceWeight: config.weight || 8,
+        date: new Date().toISOString().split('T')[0]
+      });
+    }
+  } catch (err) {
+    console.error('[HuggingFace] Error:', err.message);
+  }
+  return items;
+}
+
+async function collectGitHub(config) {
+  const items = [];
+  try {
+    const { data } = await fetch('https://github.com/trending?since=daily');
+    const repoMatches = data.match(/<article class="Box-row">[\s\S]*?<\/article>/g) || [];
+    
+    for (const repo of repoMatches.slice(0, config.maxResults || 30)) {
+      const href = repo.match(/href="\/([^"]+)"/)?.[1];
+      const desc = repo.match(/<p class="col-9[^"]*">([\s\S]*?)<\/p>/)?.[1]?.trim().replace(/<[^>]+>/g, '');
+      const stars = repo.match(/([\d,]+)\s*stars today/)?.[1] || '0';
+      
+      if (href) {
+        items.push({
+          title: href,
+          summary: (desc || '').slice(0, 200),
+          url: `https://github.com/${href}`,
+          source: 'GitHub Trending',
+          sourceWeight: config.weight || 7,
+          stars: parseInt(stars.replace(/,/g, '')) || 0,
+          date: new Date().toISOString().split('T')[0]
+        });
+      }
+    }
+  } catch (err) {
+    console.error('[GitHub] Error:', err.message);
+  }
+  return items;
+}
+
+// ============ 处理模块 ============
+
+function calculateKeywordScore(item, topics) {
+  const text = `${item.title} ${item.summary}`.toLowerCase();
+  let score = 0;
+  for (const topic of topics) {
+    for (const keyword of topic.keywords) {
+      if (text.includes(keyword.toLowerCase())) score += 2;
+    }
+  }
+  return score;
+}
+
+function deduplicate(items, seenUrls) {
+  return items.filter(item => {
+    const urlKey = item.url.toLowerCase().replace(/\/$/, '');
+    if (seenUrls.has(urlKey)) return false;
+    seenUrls.add(urlKey);
+    return true;
+  });
+}
+
+function sortItems(items, topics) {
+  return items.map(item => ({
+    ...item,
+    finalScore: item.sourceWeight + calculateKeywordScore(item, topics) + (item.stars > 100 ? 2 : 0)
+  })).sort((a, b) => b.finalScore - a.finalScore);
+}
+
+// ============ 输出模块 ============
+
+function generateMarkdown(items, topCount, topics) {
+  const date = new Date().toLocaleDateString('zh-CN', {
+    year: 'numeric', month: '2-digit', day: '2-digit', timeZone: 'Asia/Shanghai'
+  }).replace(/\//g, '-');
+  
+  const top10 = items.slice(0, topCount);
+  
+  let md = `# AI Daily Brief - ${date}\n\n`;
+  md += `> 采集时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}\n`;
+  md += `> 总条目: ${items.length}\n\n`;
+  
+  md += `## 🔥 Top ${topCount} 重要消息\n\n`;
+  for (let i = 0; i < top10.length; i++) {
+    const item = top10[i];
+    md += `${i + 1}. [${item.title}](${item.url}) - **${item.source}**\n`;
+    if (item.summary) md += `   > ${item.summary.slice(0, 150)}${item.summary.length > 150 ? '...' : ''}\n`;
+    md += '\n';
+  }
+  
+  md += `## 📂 分类汇总\n\n`;
+  
+  for (const topic of topics) {
+    const topicItems = items.filter(item => {
+      const text = `${item.title} ${item.summary}`.toLowerCase();
+      return topic.keywords.some(k => text.includes(k.toLowerCase()));
+    }).filter(item => !top10.includes(item));
+    
+    if (topicItems.length > 0) {
+      md += `### ${topic.name}\n\n`;
+      for (const item of topicItems.slice(0, 10)) {
+        md += `- [${item.title}](${item.url}) - ${item.source}\n`;
+      }
+      md += '\n';
+    }
+  }
+  
+  md += `---\n*Generated by AINewsCollector*\n`;
+  
+  return { md, date };
+}
+
+// ============ 主流程 ============
+
+async function main() {
+  console.log('🚀 AINewsCollector 开始运行...\n');
+  
+  const config = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
+  
+  let seenUrls = new Set();
+  if (fs.existsSync(CACHE_PATH)) {
+    const cache = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
+    seenUrls = new Set(cache.urls || []);
+  }
+  
+  let allItems = [];
+  
+  console.log('📡 采集 arXiv...');
+  const arxivItems = await collectArxiv(config.sources.arxiv);
+  allItems.push(...arxivItems);
+  console.log(`   获取 ${arxivItems.length} 条\n`);
+  
+  console.log('📡 采集 Hugging Face Papers...');
+  const hfItems = await collectHuggingFace(config.sources.huggingface);
+  allItems.push(...hfItems);
+  console.log(`   获取 ${hfItems.length} 条\n`);
+  
+  console.log('📡 采集 GitHub Trending...');
+  const ghItems = await collectGitHub(config.sources.github);
+  allItems.push(...ghItems);
+  console.log(`   获取 ${ghItems.length} 条\n`);
+  
+  const originalCount = allItems.length;
+  allItems = deduplicate(allItems, seenUrls);
+  console.log(`🔄 去重: ${originalCount} → ${allItems.length}\n`);
+  
+  allItems = sortItems(allItems, config.topics);
+  
+  const { md, date } = generateMarkdown(allItems, config.output.topCount, config.topics);
+  
+  if (!fs.existsSync(DAILY_DIR)) fs.mkdirSync(DAILY_DIR, { recursive: true });
+  const outputPath = path.join(DAILY_DIR, `${date}.md`);
+  fs.writeFileSync(outputPath, md);
+  console.log(`📝 简报已保存: ${outputPath}\n`);
+  
+  const cacheData = { lastUpdate: new Date().toISOString(), urls: Array.from(seenUrls).slice(-5000) };
+  fs.writeFileSync(CACHE_PATH, JSON.stringify(cacheData, null, 2));
+  
+  console.log('✅ 采集完成!\n');
+  
+  return { success: true, outputPath, date, itemCount: allItems.length, content: md };
+}
+
+if (require.main === module) {
+  main().catch(err => { console.error('❌ Error:', err); process.exit(1); });
+}
+
+module.exports = { main };
--- a/skill/ai-news-collector/push.js
+++ b/skill/ai-news-collector/push.js
@ -0,0 +1,49 @@
+#!/usr/bin/env node
+
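+/**
+ * Feishu push script (report loader).
+ * Loads yesterday's daily brief, prints a preview and returns its content.
+ * The delivery to Feishu itself is not performed by this script.
+ */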
+
+const fs = require('fs');
+const path = require('path');
+
+const DAILY_DIR = path.join(__dirname, '../../daily');
+
+function getYesterdayDate() {
+  const d = new Date();
+  d.setDate(d.getDate() - 1);
+  return d.toLocaleDateString('zh-CN', {
+    year: 'numeric',
+    month: '2-digit',
+    day: '2-digit',
+    timeZone: 'Asia/Shanghai'
+  }).replace(/\//g, '-');
+}
+
+async function main() {
+  const yesterday = getYesterdayDate();
+  const reportPath = path.join(DAILY_DIR, `${yesterday}.md`);
+  
+  if (!fs.existsSync(reportPath)) {
+    console.log(`❌ 未找到昨日简报: ${reportPath}`);
+    return { success: false, error: 'Report not found' };
+  }
+  
+  const content = fs.readFileSync(reportPath, 'utf8');
+  
+  console.log(`📋 昨日简报 (${yesterday}):\n`);
+  console.log(content.slice(0, 500) + '...\n');
+  
+  return {
+    success: true,
+    date: yesterday,
+    content
+  };
+}
+
+if (require.main === module) {
+  main().catch(console.error);
+}
+
+module.exports = { main };