diff --git a/cache/seen_urls.json b/cache/seen_urls.json index 4908b1e..26dfeff 100644 --- a/cache/seen_urls.json +++ b/cache/seen_urls.json @@ -1,4 +1,4 @@ { - "lastUpdate": "", + "lastUpdate": "2026-02-22T16:15:49.916Z", "urls": [] -} +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e711543 --- /dev/null +++ b/config.json @@ -0,0 +1,38 @@ +{ + "topics": [ + { + "name": "AI 编程工具 / Code Agent", + "keywords": ["copilot", "cursor", "code agent", "code generation", "code assistant", "ai coding", "llm programming", "autocomplete", "code completion"] + }, + { + "name": "Agent 框架", + "keywords": ["langchain", "llamaindex", "crewai", "autogen", "agent framework", "multi-agent", "tool use", "function calling", "agent workflow"] + }, + { + "name": "AI 基础设施 / 推理优化", + "keywords": ["inference", "optimization", "vllm", "tensorrt", "onnx", "quantization", "distributed training", "model serving", "gpu optimization", "cuda", "triton", "speculative decoding"] + } + ], + "sources": { + "arxiv": { + "enabled": true, + "categories": ["cs.AI", "cs.CL", "cs.LG"], + "weight": 6, + "keywordMatchWeight": 10, + "maxResults": 50 + }, + "huggingface": { + "enabled": true, + "weight": 8, + "maxResults": 30 + }, + "github": { + "enabled": true, + "weight": 7, + "maxResults": 30 + } + }, + "output": { + "topCount": 10 + } +} diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..6423f7b --- /dev/null +++ b/config.yaml @@ -0,0 +1,98 @@ +# AINewsCollector 配置文件 + +# 话题关键词配置 +topics: + - name: AI 编程工具 / Code Agent + keywords: + - copilot + - cursor + - code agent + - code generation + - code assistant + - ai coding + - llm programming + - autocomplete + - code completion + + - name: Agent 框架 + keywords: + - langchain + - llamaindex + - crewai + - autogen + - agent framework + - multi-agent + - tool use + - function calling + - agent workflow + + - name: AI 基础设施 / 推理优化 + keywords: + - inference + - optimization + - vllm + - 
tensorrt + - onnx + - quantization + - distributed training + - model serving + - gpu optimization + - cuda + - triton + - speculative decoding + +# 来源配置 +sources: + arxiv: + enabled: true + categories: + - cs.AI + - cs.CL + - cs.LG + weight: 6 + keywordMatchWeight: 10 + maxResults: 50 + + huggingface: + enabled: true + weight: 8 + maxResults: 30 + + github: + enabled: true + weight: 7 + maxResults: 30 + + blogs: + enabled: true + sources: + - name: Anthropic + url: https://www.anthropic.com/news + weight: 9 + - name: OpenAI + url: https://openai.com/blog + weight: 9 + - name: DeepMind + url: https://deepmind.google/discover/blog + weight: 8 + +# 去重配置 +dedup: + cacheFile: cache/seen_urls.json + retentionDays: 30 + +# 输出配置 +output: + dailyDir: daily + topCount: 10 + +# Gitea 配置 +gitea: + url: https://gitea.cynthia.life + repo: chen/AINewsCollector + tokenFile: ../credentials/gitea.md + +# 飞书推送配置 +feishu: + enabled: true + pushTime: "09:00" diff --git a/daily/2026-02-23.md b/daily/2026-02-23.md new file mode 100644 index 0000000..ea66d35 --- /dev/null +++ b/daily/2026-02-23.md @@ -0,0 +1,11 @@ +# AI Daily Brief - 2026-02-23 + +> 采集时间: 2026/2/23 00:15:49 +> 总条目: 0 + +## 🔥 Top 10 重要消息 + +## 📂 分类汇总 + +--- +*Generated by AINewsCollector* diff --git a/skill/ai-news-collector/SKILL.md b/skill/ai-news-collector/SKILL.md new file mode 100644 index 0000000..e9f4257 --- /dev/null +++ b/skill/ai-news-collector/SKILL.md @@ -0,0 +1,35 @@ +# AINewsCollector Skill + +AI 圈最新消息自动收集与整理。 + +## 功能 + +- 从 arXiv、Hugging Face Papers、GitHub Trending 采集 AI 相关信息 +- 基于话题关键词筛选和排序 +- 生成每日简报 (.md) +- 提交到 Gitea 仓库 +- 推送至飞书 + +## 话题 + +- AI 编程工具 / Code Agent +- Agent 框架 +- AI 基础设施 / 推理优化 + +## 使用 + +### 手动触发 + +``` +采集 AI 新闻 +``` + +### 定时任务 + +- 23:00 收集任务 +- 09:00 推送任务 + +## 文件 + +- `collect.js` - 核心采集脚本 +- `push.js` - 飞书推送脚本 diff --git a/skill/ai-news-collector/collect.js b/skill/ai-news-collector/collect.js new file mode 100644 index 0000000..d89eb73 --- /dev/null +++ 
b/skill/ai-news-collector/collect.js
@@ -0,0 +1,412 @@
+#!/usr/bin/env node
+
+/**
+ * AINewsCollector - core AI news collection script.
+ *
+ * Pulls items from arXiv, Hugging Face Papers and GitHub Trending, drops URLs
+ * already recorded in the cache file, ranks the rest by source weight and
+ * topic keywords, and writes a Markdown brief into the daily directory.
+ */
+
+const fs = require('fs');
+const path = require('path');
+const https = require('https');
+const http = require('http');
+
+// Paths are resolved relative to this script's directory.
+const CONFIG_PATH = path.join(__dirname, '../../config.json');
+const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json');
+const DAILY_DIR = path.join(__dirname, '../../daily');
+
+// Upper bound on HTTP redirects followed by fetch().
+const MAX_REDIRECTS = 5;
+
+/**
+ * Minimal promise-based HTTP(S) GET.
+ *
+ * Follows redirects (up to redirectsLeft) and decodes the body as UTF-8 through
+ * the stream decoder, so multi-byte characters split across chunks survive.
+ * Non-2xx responses still resolve; callers are expected to inspect status.
+ *
+ * @param {string} url - Absolute http:// or https:// URL.
+ * @param {number} [redirectsLeft=MAX_REDIRECTS] - Remaining redirect budget.
+ * @returns {Promise<{data: string, status: number}>} Body text and status code.
+ */
+function fetch(url, redirectsLeft = MAX_REDIRECTS) {
+  return new Promise((resolve, reject) => {
+    const client = url.startsWith('https') ? https : http;
+    const req = client.get(url, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (compatible; AINewsCollector/1.0)',
+        'Accept': 'application/json, text/html, */*'
+      },
+      timeout: 30000
+    }, (res) => {
+      const { statusCode, headers } = res;
+      if (statusCode >= 300 && statusCode < 400 && headers.location) {
+        res.resume(); // drain the redirect body so the socket is released
+        if (redirectsLeft <= 0) {
+          reject(new Error('Too many redirects'));
+          return;
+        }
+        // Location may be relative, so resolve it against the current URL.
+        resolve(fetch(new URL(headers.location, url).href, redirectsLeft - 1));
+        return;
+      }
+      res.setEncoding('utf8');
+      let data = '';
+      res.on('data', (chunk) => { data += chunk; });
+      res.on('end', () => resolve({ data, status: statusCode }));
+      res.on('error', reject);
+    });
+    req.on('error', reject);
+    req.on('timeout', () => { req.destroy(); reject(new Error('Timeout')); });
+  });
+}
+
+/**
+ * Throws when an HTTP status code is outside the 2xx range.
+ *
+ * @param {number} status - HTTP status code.
+ * @throws {Error} With message "HTTP <status>" for non-2xx codes.
+ */
+function assertOk(status) {
+  if (status < 200 || status >= 300) throw new Error(`HTTP ${status}`);
+}
+
+// ============ Collectors ============
+
+/**
+ * Collects recently updated papers from the arXiv API, one query per category.
+ * A failure in one category is logged and does not stop the others.
+ *
+ * @param {{categories?: string[], weight?: number, maxResults?: number}} config
+ * @returns {Promise<object[]>} Collected items (possibly empty).
+ */
+async function collectArxiv(config) {
+  const items = [];
+  const categories = config.categories || ['cs.AI', 'cs.CL', 'cs.LG'];
+
+  for (const cat of categories) {
+    try {
+      // The arXiv API names its page-size parameter max_results.
+      const query = new URLSearchParams({
+        search_query: `cat:${cat}`,
+        sortBy: 'lastUpdatedDate',
+        sortOrder: 'descending',
+        max_results: String(config.maxResults || 50)
+      });
+      const { data, status } = await fetch(`http://export.arxiv.org/api/query?${query}`);
+      assertOk(status);
+
+      // Text before the first <entry> is feed-level metadata, hence slice(1).
+      const entries = data.split('<entry>').slice(1);
+
+      for (const entry of entries) {
+        const title = entry.match(/<title>([\s\S]*?)<\/title>/)?.[1]?.trim().replace(/\s+/g, ' ');
+        const summary = entry.match(/<summary>([\s\S]*?)<\/summary>/)?.[1]?.trim().replace(/\s+/g, ' ').slice(0, 200);
+        const link = entry.match(/<id>([\s\S]*?)<\/id>/)?.[1]?.trim();
+
+        if (title && link) {
+          items.push({
+            title,
+            summary: summary || '',
+            url: link,
+            source: 'arXiv',
+            sourceWeight: config.weight || 6,
+            date: new Date().toISOString().split('T')[0]
+          });
+        }
+      }
+    } catch (err) {
+      console.error(`[arXiv] Error:`, err.message);
+    }
+  }
+  return items;
+}
+
+/**
+ * Collects the Hugging Face daily papers list.
+ *
+ * @param {{weight?: number, maxResults?: number}} config
+ * @returns {Promise<object[]>} Collected items; empty when the request fails.
+ */
+async function collectHuggingFace(config) {
+  const items = [];
+  try {
+    // NOTE(review): the public endpoint may be spelled daily_papers; confirm
+    // against the Hugging Face Hub API before relying on this URL.
+    const { data, status } = await fetch(`https://huggingface.co/api/daily-papers?limit=${config.maxResults || 30}`);
+    assertOk(status);
+    const papers = JSON.parse(data);
+    if (!Array.isArray(papers)) throw new Error('Unexpected response shape');
+
+    for (const paper of papers) {
+      const id = paper.paper?.id || paper.id;
+      if (!id) continue; // no id means there is no paper page to link to
+      items.push({
+        title: paper.title || paper.paper?.title || 'Untitled',
+        summary: (paper.summary || paper.paper?.summary || '').slice(0, 200),
+        url: `https://huggingface.co/papers/${id}`,
+        source: 'Hugging Face',
+        sourceWeight: config.weight || 8,
+        date: new Date().toISOString().split('T')[0]
+      });
+    }
+  } catch (err) {
+    console.error('[HuggingFace] Error:', err.message);
+  }
+  return items;
+}
+
+/**
+ * Scrapes the GitHub Trending page (daily window).
+ *
+ * @param {{weight?: number, maxResults?: number}} config
+ * @returns {Promise<object[]>} Collected items; empty when the request fails.
+ */
+async function collectGitHub(config) {
+  const items = [];
+  try {
+    const { data, status } = await fetch('https://github.com/trending?since=daily');
+    assertOk(status);
+    const repoMatches = data.match(/<article class="Box-row">[\s\S]*?<\/article>/g) || [];
+
+    for (const repo of repoMatches.slice(0, config.maxResults || 30)) {
+      // Prefer the link inside the row heading; fall back to the first link in
+      // the row, which may not be the repository link.
+      const href = (repo.match(/<h2[\s\S]*?href="\/([^"]+)"/) || repo.match(/href="\/([^"]+)"/))?.[1];
+      const desc = repo.match(/<p class="col-9[^"]*">([\s\S]*?)<\/p>/)?.[1]?.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
+      const stars = repo.match(/([\d,]+)\s*stars today/)?.[1] || '0';
+
+      if (href) {
+        items.push({
+          title: href,
+          summary: (desc || '').slice(0, 200),
+          url: `https://github.com/${href}`,
+          source: 'GitHub Trending',
+          sourceWeight: config.weight || 7,
+          stars: Number.parseInt(stars.replace(/,/g, ''), 10) || 0,
+          date: new Date().toISOString().split('T')[0]
+        });
+      }
+    }
+  } catch (err) {
+    console.error('[GitHub] Error:', err.message);
+  }
+  return items;
+}
+
+// ============ Processing ============
+
+/**
+ * Scores an item by topic keywords: +2 for every keyword (across all topics)
+ * found, case-insensitively, in the item's title or summary.
+ *
+ * @param {{title: string, summary: string}} item
+ * @param {{keywords: string[]}[]} topics
+ * @returns {number} Keyword score (0 when nothing matches).
+ */
+function calculateKeywordScore(item, topics) {
+  const text = `${item.title} ${item.summary}`.toLowerCase();
+  let score = 0;
+  for (const topic of topics) {
+    for (const keyword of topic.keywords) {
+      if (text.includes(keyword.toLowerCase())) score += 2;
+    }
+  }
+  return score;
+}
+
+/**
+ * Drops items whose URL is already in seenUrls and records the URLs of the
+ * kept items in it. Keys are lower-cased with one trailing slash removed.
+ *
+ * @param {{url: string}[]} items
+ * @param {Set<string>} seenUrls - Mutated: kept URLs are added.
+ * @returns {object[]} Items not seen before, in their original order.
+ */
+function deduplicate(items, seenUrls) {
+  return items.filter(item => {
+    const urlKey = item.url.toLowerCase().replace(/\/$/, '');
+    if (seenUrls.has(urlKey)) return false;
+    seenUrls.add(urlKey);
+    return true;
+  });
+}
+
+/**
+ * Returns copies of the items with a finalScore field, sorted by it descending.
+ * finalScore = sourceWeight + keyword score + 2 when stars exceed 100.
+ *
+ * @param {object[]} items
+ * @param {{keywords: string[]}[]} topics
+ * @returns {object[]} New array; the input items are not modified.
+ */
+function sortItems(items, topics) {
+  return items.map(item => ({
+    ...item,
+    finalScore: item.sourceWeight + calculateKeywordScore(item, topics) + (item.stars > 100 ? 2 : 0)
+  })).sort((a, b) => b.finalScore - a.finalScore);
+}
+
+// ============ Output ============
+
+/**
+ * Makes a title safe to use as Markdown link text: collapses whitespace
+ * (including newlines) and escapes square brackets.
+ *
+ * @param {string} title
+ * @returns {string}
+ */
+function toLinkText(title) {
+  return String(title).replace(/\s+/g, ' ').trim().replace(/[[\]]/g, '\\$&');
+}
+
+/**
+ * Renders the daily brief as Markdown.
+ *
+ * @param {object[]} items - Items already sorted by descending score.
+ * @param {number} topCount - Number of items in the "Top" section.
+ * @param {{name: string, keywords: string[]}[]} topics - Topic sections.
+ * @returns {{md: string, date: string}} Markdown text and the report date as
+ *   formatted for the Asia/Shanghai time zone.
+ */
+function generateMarkdown(items, topCount, topics) {
+  const now = new Date();
+  const date = now.toLocaleDateString('zh-CN', {
+    year: 'numeric', month: '2-digit', day: '2-digit', timeZone: 'Asia/Shanghai'
+  }).replace(/\//g, '-');
+
+  const topItems = items.slice(0, topCount);
+  const topSet = new Set(topItems);
+
+  let md = `# AI Daily Brief - ${date}\n\n`;
+  md += `> 采集时间: ${now.toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}\n`;
+  md += `> 总条目: ${items.length}\n\n`;
+
+  md += `## 🔥 Top ${topCount} 重要消息\n\n`;
+  topItems.forEach((item, i) => {
+    md += `${i + 1}. [${toLinkText(item.title)}](${item.url}) - **${item.source}**\n`;
+    // Three spaces keep the quote nested under its numbered list item.
+    if (item.summary) md += `   > ${item.summary.slice(0, 150)}${item.summary.length > 150 ? '...' : ''}\n`;
+    md += '\n';
+  });
+
+  md += `## 📂 分类汇总\n\n`;
+
+  for (const topic of topics) {
+    // Items already listed in the Top section are not repeated per topic.
+    const topicItems = items.filter(item => {
+      if (topSet.has(item)) return false;
+      const text = `${item.title} ${item.summary}`.toLowerCase();
+      return topic.keywords.some(k => text.includes(k.toLowerCase()));
+    });
+
+    if (topicItems.length > 0) {
+      md += `### ${topic.name}\n\n`;
+      for (const item of topicItems.slice(0, 10)) {
+        md += `- [${toLinkText(item.title)}](${item.url}) - ${item.source}\n`;
+      }
+      md += '\n';
+    }
+  }
+
+  md += `---\n*Generated by AINewsCollector*\n`;
+
+  return { md, date };
+}
+
+// ============ Main flow ============
+
+// Collectors in run order; key indexes config.sources, label is used in logs.
+const COLLECTORS = [
+  { key: 'arxiv', label: 'arXiv', collect: collectArxiv },
+  { key: 'huggingface', label: 'Hugging Face Papers', collect: collectHuggingFace },
+  { key: 'github', label: 'GitHub Trending', collect: collectGitHub }
+];
+
+/**
+ * Loads the set of already-seen URL keys from the cache file. A missing or
+ * unreadable cache yields an empty set instead of aborting the run.
+ *
+ * @returns {Set<string>}
+ */
+function loadSeenUrls() {
+  if (!fs.existsSync(CACHE_PATH)) return new Set();
+  try {
+    const cache = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
+    return new Set(cache.urls || []);
+  } catch (err) {
+    console.error('[Cache] Error:', err.message);
+    return new Set();
+  }
+}
+
+/**
+ * Runs one collection pass: collect, de-duplicate, rank, write the report and
+ * update the cache.
+ *
+ * @returns {Promise<{success: boolean, outputPath: string, date: string,
+ *   itemCount: number, content: string}>}
+ * @throws {Error} When the config file cannot be read or parsed, or when the
+ *   report or cache cannot be written.
+ */
+async function main() {
+  console.log('🚀 AINewsCollector 开始运行...\n');
+
+  const config = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
+  const sources = config.sources || {};
+  const topics = config.topics || [];
+  const seenUrls = loadSeenUrls();
+
+  let allItems = [];
+
+  // Sources run one after another so the log output stays grouped per source.
+  for (const { key, label, collect } of COLLECTORS) {
+    const sourceConfig = sources[key];
+    // A source that is absent from the config or has enabled: false is skipped.
+    if (!sourceConfig || sourceConfig.enabled === false) continue;
+
+    console.log(`📡 采集 ${label}...`);
+    const collected = await collect(sourceConfig);
+    allItems.push(...collected);
+    console.log(`   获取 ${collected.length} 条\n`);
+  }
+
+  const originalCount = allItems.length;
+  allItems = deduplicate(allItems, seenUrls);
+  console.log(`🔄 去重: ${originalCount} → ${allItems.length}\n`);
+
+  allItems = sortItems(allItems, topics);
+
+  const { md, date } = generateMarkdown(allItems, config.output?.topCount ?? 10, topics);
+
+  fs.mkdirSync(DAILY_DIR, { recursive: true });
+  const outputPath = path.join(DAILY_DIR, `${date}.md`);
+  let content = md;
+  if (allItems.length === 0 && fs.existsSync(outputPath)) {
+    // A re-run on the same day may find every URL in the cache; keep the
+    // earlier report instead of overwriting it with an empty one.
+    content = fs.readFileSync(outputPath, 'utf8');
+    console.log(`⏭️ 无新条目，保留已有简报: ${outputPath}\n`);
+  } else {
+    fs.writeFileSync(outputPath, md);
+    console.log(`📝 简报已保存: ${outputPath}\n`);
+  }
+
+  // Only the most recent 5000 URL keys are kept to bound the cache size.
+  const cacheData = { lastUpdate: new Date().toISOString(), urls: Array.from(seenUrls).slice(-5000) };
+  fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
+  fs.writeFileSync(CACHE_PATH, JSON.stringify(cacheData, null, 2));
+
+  console.log('✅ 采集完成!\n');
+
+  return { success: true, outputPath, date, itemCount: allItems.length, content };
+}
+
+if (require.main === module) {
+  main().catch(err => { console.error('❌ Error:', err); process.exit(1); });
+}
+
+module.exports = { main };
diff --git a/skill/ai-news-collector/push.js b/skill/ai-news-collector/push.js
new file mode 100644
index 0000000..b437549
--- /dev/null
+++ b/skill/ai-news-collector/push.js
@@ -0,0 +1,78 @@
+#!/usr/bin/env node
+
+/**
+ * Feishu push script.
+ *
+ * Loads yesterday's brief from the daily directory and prints a preview.
+ * NOTE: no Feishu API call is made in this file; main() returns the report
+ * content to its caller.
+ */
+
+const fs = require('fs');
+const path = require('path');
+
+const DAILY_DIR = path.join(__dirname, '../../daily');
+
+// Maximum number of characters of the report echoed to the console.
+const PREVIEW_LENGTH = 500;
+
+/**
+ * Returns yesterday's date, formatted for the Asia/Shanghai time zone in the
+ * same way collect.js names its report files.
+ *
+ * @returns {string}
+ */
+function getYesterdayDate() {
+  // Step back exactly 24 hours from now rather than shifting the host-local
+  // calendar day, so the result does not depend on the host's time zone rules.
+  const d = new Date(Date.now() - 24 * 60 * 60 * 1000);
+  return d.toLocaleDateString('zh-CN', {
+    year: 'numeric',
+    month: '2-digit',
+    day: '2-digit',
+    timeZone: 'Asia/Shanghai'
+  }).replace(/\//g, '-');
+}
+
+/**
+ * Reads yesterday's brief and prints a preview of it.
+ *
+ * @returns {Promise<{success: true, date: string, content: string} |
+ *   {success: false, error: string}>}
+ */
+async function main() {
+  const yesterday = getYesterdayDate();
+  const reportPath = path.join(DAILY_DIR, `${yesterday}.md`);
+
+  if (!fs.existsSync(reportPath)) {
+    console.log(`❌ 未找到昨日简报: ${reportPath}`);
+    return { success: false, error: 'Report not found' };
+  }
+
+  const content = fs.readFileSync(reportPath, 'utf8');
+
+  console.log(`📋 昨日简报 (${yesterday}):\n`);
+  // Only mark the preview as truncated when something was actually cut off.
+  const ellipsis = content.length > PREVIEW_LENGTH ? '...' : '';
+  console.log(`${content.slice(0, PREVIEW_LENGTH)}${ellipsis}\n`);
+
+  return {
+    success: true,
+    date: yesterday,
+    content
+  };
+}
+
+if (require.main === module) {
+  main()
+    .then((result) => {
+      // Let schedulers see a missing report as a failed run.
+      if (!result.success) process.exitCode = 1;
+    })
+    .catch((err) => {
+      console.error(err);
+      process.exitCode = 1;
+    });
+}
+
+module.exports = { main };