#!/usr/bin/env node
/**
 * AINewsCollector - core AI news collection script.
 *
 * Pulls recent items from arXiv, Hugging Face daily papers and GitHub
 * Trending, drops URLs already recorded in the cache file, ranks the rest by
 * source weight + keyword hits, and writes a Markdown brief into DAILY_DIR.
 */
'use strict';

const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');

// Configuration
const CONFIG_PATH = path.join(__dirname, '../../config.json');
const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json');
const DAILY_DIR = path.join(__dirname, '../../daily');

// Limits used by the helpers below.
const REQUEST_TIMEOUT_MS = 30000;
const MAX_REDIRECTS = 5;
const SUMMARY_MAX_LENGTH = 200;
const CACHE_MAX_URLS = 5000;

/**
 * Current date as YYYY-MM-DD, derived from the UTC ISO timestamp.
 * @returns {string}
 */
function todayIso() {
  return new Date().toISOString().split('T')[0];
}

// HTTP request wrapper
/**
 * Performs a GET request and resolves with the decoded body and status code.
 *
 * @param {string} url - Absolute http:// or https:// URL.
 * @param {number} [redirectsLeft] - How many 3xx hops may still be followed.
 * @returns {Promise<{data: string, status: number}>}
 *   Rejects on socket errors, on timeout ('Timeout'), on an unparsable
 *   redirect target, or when the redirect budget is exhausted.
 */
function fetch(url, redirectsLeft = MAX_REDIRECTS) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;
    const options = {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; AINewsCollector/1.0)',
        'Accept': 'application/json, text/html, */*',
      },
      timeout: REQUEST_TIMEOUT_MS,
    };

    const req = client.get(url, options, (res) => {
      const status = res.statusCode;
      const location = res.headers.location;

      if (status >= 300 && status < 400 && location) {
        // Drain the redirect response so the socket is released.
        res.resume();
        if (redirectsLeft <= 0) {
          reject(new Error(`Too many redirects: ${url}`));
          return;
        }
        let nextUrl;
        try {
          // Location may be relative; resolve it against the current URL.
          nextUrl = new URL(location, url).toString();
        } catch (err) {
          reject(err);
          return;
        }
        fetch(nextUrl, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      // Decode as UTF-8 at the stream level so a multi-byte character split
      // across two chunks is not corrupted by string concatenation.
      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => resolve({ data, status }));
      res.on('error', reject);
    });

    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Timeout'));
    });
  });
}

/**
 * Fetches a URL and returns its body, treating any non-2xx status as an error
 * so that error pages are never parsed as if they were real payloads.
 *
 * @param {string} url
 * @returns {Promise<string>}
 */
async function fetchBody(url) {
  const { data, status } = await fetch(url);
  if (status < 200 || status >= 300) {
    throw new Error(`HTTP ${status} for ${url}`);
  }
  return data;
}

// ============ Collectors ============

/**
 * Collects the most recently updated papers for each configured arXiv
 * category by scraping the Atom feed with regular expressions.
 *
 * @param {{categories?: string[], maxResults?: number, weight?: number}} [config]
 * @returns {Promise<object[]>} Items; a failing category is logged and skipped.
 */
async function collectArxiv(config = {}) {
  const items = [];
  const categories = config.categories || ['cs.AI', 'cs.CL', 'cs.LG'];
  const maxResults = config.maxResults || 50;
  const weight = config.weight ?? 6;

  for (const cat of categories) {
    try {
      // The arXiv API names this parameter `max_results`; `maxResults` is
      // not recognised and would leave the result count at the API default.
      const url =
        'http://export.arxiv.org/api/query' +
        `?search_query=cat:${encodeURIComponent(cat)}` +
        '&sortBy=lastUpdatedDate&sortOrder=descending' +
        `&max_results=${maxResults}`;
      const data = await fetchBody(url);

      // Everything before the first <entry> is feed-level metadata.
      const entries = data.split('<entry>').slice(1);
      for (const entry of entries) {
        const title = entry
          .match(/<title>([\s\S]*?)<\/title>/)?.[1]
          ?.trim()
          .replace(/\s+/g, ' ');
        const summary = entry
          .match(/<summary>([\s\S]*?)<\/summary>/)?.[1]
          ?.trim()
          .replace(/\s+/g, ' ')
          .slice(0, SUMMARY_MAX_LENGTH);
        const link = entry.match(/<id>([\s\S]*?)<\/id>/)?.[1]?.trim();

        if (title && link) {
          items.push({
            title,
            summary: summary || '',
            url: link,
            source: 'arXiv',
            sourceWeight: weight,
            date: todayIso(),
          });
        }
      }
    } catch (err) {
      console.error(`[arXiv] Error:`, err.message);
    }
  }
  return items;
}

/**
 * Collects Hugging Face daily papers.
 *
 * @param {{maxResults?: number, weight?: number}} [config]
 * @returns {Promise<object[]>} Items, or an empty array if the request fails.
 */
async function collectHuggingFace(config = {}) {
  const items = [];
  try {
    // NOTE(review): this endpoint is often referenced as /api/daily_papers
    // (underscore) — confirm the hyphenated path still responds.
    const data = await fetchBody(
      `https://huggingface.co/api/daily-papers?limit=${config.maxResults || 30}`
    );
    const papers = JSON.parse(data);
    if (!Array.isArray(papers)) {
      throw new Error('Unexpected response shape');
    }

    for (const paper of papers) {
      // Fields are read from the top level first, then from the nested
      // `paper` object, for both title and summary.
      const summary = paper.summary || paper.paper?.summary || '';
      items.push({
        title: paper.title || paper.paper?.title || 'Untitled',
        summary: summary.slice(0, SUMMARY_MAX_LENGTH),
        url: `https://huggingface.co/papers/${paper.paper?.id || paper.id}`,
        source: 'Hugging Face',
        sourceWeight: config.weight ?? 8,
        date: todayIso(),
      });
    }
  } catch (err) {
    console.error('[HuggingFace] Error:', err.message);
  }
  return items;
}

/**
 * Collects repositories from the GitHub Trending (daily) HTML page.
 *
 * @param {{maxResults?: number, weight?: number}} [config]
 * @returns {Promise<object[]>} Items, or an empty array if the request fails.
 */
async function collectGitHub(config = {}) {
  const items = [];
  try {
    const data = await fetchBody('https://github.com/trending?since=daily');
    const repoMatches =
      data.match(/<article class="Box-row">[\s\S]*?<\/article>/g) || [];

    for (const repo of repoMatches.slice(0, config.maxResults || 30)) {
      // Prefer the link inside the <h2> heading; presumably that is the
      // repository link while earlier anchors may be action buttons — TODO
      // confirm against current markup. Falls back to the first href.
      const href =
        repo.match(/<h2[^>]*>[\s\S]*?<a[^>]*href="\/([^"]+)"/)?.[1] ??
        repo.match(/href="\/([^"]+)"/)?.[1];
      const desc = repo
        .match(/<p class="col-9[^"]*">([\s\S]*?)<\/p>/)?.[1]
        ?.replace(/<[^>]+>/g, '')
        .trim();
      const stars = repo.match(/([\d,]+)\s*stars today/)?.[1] || '0';

      if (href) {
        items.push({
          title: href,
          summary: (desc || '').slice(0, SUMMARY_MAX_LENGTH),
          url: `https://github.com/${href}`,
          source: 'GitHub Trending',
          sourceWeight: config.weight ?? 7,
          stars: Number.parseInt(stars.replace(/,/g, ''), 10) || 0,
          date: todayIso(),
        });
      }
    }
  } catch (err) {
    console.error('[GitHub] Error:', err.message);
  }
  return items;
}

// ============ Processing ============

/**
 * Scores an item by keyword hits: +2 for every topic keyword found
 * (case-insensitively) in the item's title or summary.
 *
 * @param {{title: string, summary: string}} item
 * @param {{keywords?: string[]}[]} [topics]
 * @returns {number}
 */
function calculateKeywordScore(item, topics = []) {
  const text = `${item.title} ${item.summary}`.toLowerCase();
  let score = 0;
  for (const topic of topics) {
    for (const keyword of topic.keywords || []) {
      if (text.includes(keyword.toLowerCase())) score += 2;
    }
  }
  return score;
}

/**
 * Removes items whose normalised URL (lower-cased, trailing slash stripped)
 * is already in `seenUrls`. Every kept item's key is added to `seenUrls`,
 * so the set is mutated and also de-duplicates within the current batch.
 *
 * @param {{url: string}[]} items
 * @param {Set<string>} seenUrls
 * @returns {object[]}
 */
function deduplicate(items, seenUrls) {
  return items.filter((item) => {
    const urlKey = item.url.toLowerCase().replace(/\/$/, '');
    if (seenUrls.has(urlKey)) return false;
    seenUrls.add(urlKey);
    return true;
  });
}

/**
 * Returns copies of the items with a `finalScore` (source weight + keyword
 * score + 2 bonus when `stars` exceeds 100), sorted highest score first.
 *
 * @param {object[]} items
 * @param {object[]} topics
 * @returns {object[]}
 */
function sortItems(items, topics) {
  return items
    .map((item) => ({
      ...item,
      finalScore:
        item.sourceWeight +
        calculateKeywordScore(item, topics) +
        (item.stars > 100 ? 2 : 0),
    }))
    .sort((a, b) => b.finalScore - a.finalScore);
}

// ============ Output ============

/**
 * Renders the ranked items as a Markdown brief: a "Top N" list followed by
 * per-topic sections (up to 10 items each) that exclude the top items.
 *
 * @param {object[]} items - Items already sorted by relevance.
 * @param {number} [topCount] - Number of items in the top section.
 * @param {{name: string, keywords?: string[]}[]} [topics]
 * @returns {{md: string, date: string}} The document and the Asia/Shanghai
 *   date string embedded in its title.
 */
function generateMarkdown(items, topCount = 10, topics = []) {
  const date = new Date()
    .toLocaleDateString('zh-CN', {
      year: 'numeric',
      month: '2-digit',
      day: '2-digit',
      timeZone: 'Asia/Shanghai',
    })
    .replace(/\//g, '-');
  const topItems = items.slice(0, topCount);
  // Set lookup keeps the per-topic exclusion below linear in items.length.
  const topItemSet = new Set(topItems);

  let md = `# AI Daily Brief - ${date}\n\n`;
  md += `> 采集时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}\n`;
  md += `> 总条目: ${items.length}\n\n`;
  md += `## 🔥 Top ${topCount} 重要消息\n\n`;

  topItems.forEach((item, i) => {
    md += `${i + 1}. [${item.title}](${item.url}) - **${item.source}**\n`;
    if (item.summary) {
      md += ` > ${item.summary.slice(0, 150)}${item.summary.length > 150 ? '...' : ''}\n`;
    }
    md += '\n';
  });

  md += `## 📂 分类汇总\n\n`;
  for (const topic of topics) {
    const keywords = (topic.keywords || []).map((k) => k.toLowerCase());
    const topicItems = items.filter((item) => {
      if (topItemSet.has(item)) return false;
      const text = `${item.title} ${item.summary}`.toLowerCase();
      return keywords.some((k) => text.includes(k));
    });

    if (topicItems.length > 0) {
      md += `### ${topic.name}\n\n`;
      for (const item of topicItems.slice(0, 10)) {
        md += `- [${item.title}](${item.url}) - ${item.source}\n`;
      }
      md += '\n';
    }
  }

  md += `---\n*Generated by AINewsCollector*\n`;
  return { md, date };
}

/**
 * Loads the set of previously seen URL keys from CACHE_PATH.
 * A missing or unreadable cache is treated as empty so one corrupt file
 * cannot block a collection run.
 *
 * @returns {Set<string>}
 */
function loadSeenUrls() {
  if (!fs.existsSync(CACHE_PATH)) return new Set();
  try {
    const cache = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
    return new Set(Array.isArray(cache.urls) ? cache.urls : []);
  } catch (err) {
    console.error('[Cache] Error:', err.message);
    return new Set();
  }
}

// ============ Main flow ============

/**
 * Runs one full collection cycle: load config and cache, collect from all
 * sources, de-duplicate, rank, write the Markdown brief and update the cache.
 *
 * @returns {Promise<{success: boolean, outputPath: string, date: string,
 *   itemCount: number, content: string}>}
 * @throws If the config file is missing/invalid or the output cannot be written.
 */
async function main() {
  console.log('🚀 AINewsCollector 开始运行...\n');

  const config = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
  const sources = config.sources || {};
  const seenUrls = loadSeenUrls();

  let allItems = [];

  console.log('📡 采集 arXiv...');
  const arxivItems = await collectArxiv(sources.arxiv);
  allItems.push(...arxivItems);
  console.log(` 获取 ${arxivItems.length} 条\n`);

  console.log('📡 采集 Hugging Face Papers...');
  const hfItems = await collectHuggingFace(sources.huggingface);
  allItems.push(...hfItems);
  console.log(` 获取 ${hfItems.length} 条\n`);

  console.log('📡 采集 GitHub Trending...');
  const ghItems = await collectGitHub(sources.github);
  allItems.push(...ghItems);
  console.log(` 获取 ${ghItems.length} 条\n`);

  const originalCount = allItems.length;
  allItems = deduplicate(allItems, seenUrls);
  console.log(`🔄 去重: ${originalCount} → ${allItems.length}\n`);

  allItems = sortItems(allItems, config.topics);

  const { md, date } = generateMarkdown(
    allItems,
    config.output?.topCount,
    config.topics
  );

  fs.mkdirSync(DAILY_DIR, { recursive: true });
  const outputPath = path.join(DAILY_DIR, `${date}.md`);
  fs.writeFileSync(outputPath, md);
  console.log(`📝 简报已保存: ${outputPath}\n`);

  // Keep only the most recent URL keys so the cache file stays bounded.
  const cacheData = {
    lastUpdate: new Date().toISOString(),
    urls: Array.from(seenUrls).slice(-CACHE_MAX_URLS),
  };
  // The cache directory may not exist on a first run.
  fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
  fs.writeFileSync(CACHE_PATH, JSON.stringify(cacheData, null, 2));

  console.log('✅ 采集完成!\n');
  return {
    success: true,
    outputPath,
    date,
    itemCount: allItems.length,
    content: md,
  };
}

if (require.main === module) {
  main().catch((err) => {
    console.error('❌ Error:', err);
    process.exit(1);
  });
}

module.exports = { main };