- 替换 Google Translate 为智谱 GLM-4-Flash API - LLM 总结论文/项目核心内容生成中文标题 - 标题简洁有力,突出技术亮点 - 简化输出格式:标题 + 链接(无摘要) - 添加翻译进度显示
416 lines
13 KiB
JavaScript
416 lines
13 KiB
JavaScript
#!/usr/bin/env node
|
||
|
||
/**
|
||
* AINewsCollector - AI 新闻采集核心脚本
|
||
*/
|
||
|
||
const fs = require('fs');
const path = require('path');
const { execFileSync, execSync } = require('child_process');
|
||
|
||
// File locations, resolved relative to this script's directory.
const CONFIG_PATH = path.join(__dirname, '../../config.json');
const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json');
const DAILY_DIR = path.join(__dirname, '../../daily');

// Proxy handed to curl. Falls back to a local proxy on port 7890 when neither
// HTTP_PROXY nor HTTPS_PROXY is set.
const PROXY_URL = process.env.HTTP_PROXY || process.env.HTTPS_PROXY || 'http://127.0.0.1:7890';

// Zhipu AI chat-completions endpoint and model.
const ZHIPU_API = 'https://open.bigmodel.cn/api/paas/v4/chat/completions';
// SECURITY: the API key is read from the ZHIPU_KEY environment variable only.
// A literal key used to be committed here as a fallback; it must be treated as
// leaked and revoked. With no key set, requests are rejected by the API and
// callers fall back to the untranslated title.
const ZHIPU_KEY = process.env.ZHIPU_KEY || '';
const ZHIPU_MODEL = 'glm-4-flash';

// In-memory cache of generated Chinese titles (lives for one process run).
const translateCache = new Map();
|
||
|
||
/**
 * Ask the Zhipu chat-completions API to condense a paper/project into a
 * one-line Chinese title.
 *
 * The request is sent with curl (so the configured proxy is honoured). The JSON
 * body is piped through stdin, so no temporary file is created and nothing is
 * left behind when curl fails. curl is started without a shell.
 *
 * @param {string} title - Original (usually English) title.
 * @param {string} [summary] - Optional abstract/description used as context.
 * @returns {string} First line of the model's answer, capped at 60 characters
 *   (57 + "..."). On any failure the original `title` is returned unchanged.
 */
function summarizeToChinese(title, summary) {
  const cacheKey = `${title}|||${summary}`;
  if (translateCache.has(cacheKey)) {
    return translateCache.get(cacheKey);
  }

  // Without a credential the request can only be rejected; skip the round trip.
  if (!ZHIPU_KEY) {
    return title;
  }

  const prompt = [
    '请将以下 AI 论文/项目信息总结为一句话中文标题(30字以内,突出核心贡献或创新点):',
    '',
    `标题:${title}`,
    `摘要:${summary || '无'}`,
    '',
    '要求:',
    '1. 只输出翻译后的标题,不要其他内容',
    '2. 标题要简洁有力,突出技术亮点',
    '3. 使用专业术语的中文译名',
  ].join('\n');

  const requestBody = JSON.stringify({
    model: ZHIPU_MODEL,
    messages: [
      { role: 'system', content: '你是一个AI技术专家,擅长总结论文和项目核心内容。' },
      { role: 'user', content: prompt },
    ],
    max_tokens: 100,
    temperature: 0.3,
  });

  const curlArgs = ['-s'];
  if (PROXY_URL) {
    curlArgs.push('--proxy', PROXY_URL);
  }
  curlArgs.push(
    '-X', 'POST', ZHIPU_API,
    '-H', 'Content-Type: application/json',
    '-H', `Authorization: Bearer ${ZHIPU_KEY}`,
    // "@-" makes curl read the request body from stdin.
    '--data-binary', '@-'
  );

  try {
    const raw = execFileSync('curl', curlArgs, {
      input: requestBody,
      encoding: 'utf8',
      timeout: 30000,
      maxBuffer: 1024 * 1024,
    });

    const content = JSON.parse(raw).choices?.[0]?.message?.content?.trim();
    if (!content) {
      // The API answered without a completion (e.g. an error object). Do not
      // cache the fallback, so a later call for the same item can retry.
      console.warn('[translate] no completion in API response, keeping original title');
      return title;
    }

    // Keep only the first line; the model sometimes appends explanations.
    let translated = content.split('\n')[0].trim();
    // Truncate by code point so a surrogate pair is never cut in half.
    const chars = [...translated];
    if (chars.length > 60) {
      translated = `${chars.slice(0, 57).join('')}...`;
    }

    translateCache.set(cacheKey, translated);
    return translated;
  } catch (err) {
    // Best effort: fall back to the original title. The raw error message is
    // not logged because it embeds the curl command line, including the API key.
    const reason = err.status != null ? `curl exit code ${err.status}` : (err.code || err.name);
    console.warn(`[translate] request failed (${reason}), keeping original title`);
    return title;
  }
}
|
||
|
||
/**
 * Download a URL by running curl as a child process (used instead of an
 * in-process HTTP client so the configured proxy is honoured).
 *
 * curl is started without a shell, so characters such as `$`, backticks or
 * quotes in `url` are passed through literally instead of being interpreted.
 * `--fail` makes HTTP responses with status >= 400 raise instead of returning
 * the error page as if it were data.
 *
 * @param {string} url - Absolute URL to download; redirects are followed.
 * @returns {{data: string, status: number}} Response body decoded as UTF-8.
 *   `status` is always 200: it only signals that curl succeeded.
 * @throws {Error} "Fetch failed: ..." when curl exits non-zero (network error,
 *   timeout after 30 s, HTTP error status) or the body exceeds 10 MiB.
 */
function fetch(url) {
  const curlArgs = [
    '-s',
    '--fail',
    '-L',
    '--max-time', '30',
    '-H', 'User-Agent: Mozilla/5.0 (compatible; AINewsCollector/1.0)',
  ];
  if (PROXY_URL) {
    curlArgs.push('--proxy', PROXY_URL);
  }
  curlArgs.push(url);

  try {
    const data = execFileSync('curl', curlArgs, {
      encoding: 'utf8',
      maxBuffer: 10 * 1024 * 1024,
    });
    return { data, status: 200 };
  } catch (err) {
    throw new Error(`Fetch failed: ${err.message}`, { cause: err });
  }
}
|
||
|
||
// ============ 数据采集器 ============
|
||
|
||
/**
 * Decode XML character references and the five predefined XML entities in a
 * single pass (so "&amp;lt;" becomes "&lt;", not "<").
 */
function decodeArxivText(text) {
  const named = new Map([['lt', '<'], ['gt', '>'], ['amp', '&'], ['quot', '"'], ['apos', "'"]]);
  return text.replace(/&(#x[0-9a-f]+|#\d+|lt|gt|amp|quot|apos);/gi, (whole, ref) => {
    if (ref[0] !== '#') {
      return named.get(ref.toLowerCase());
    }
    const isHex = ref[1] === 'x' || ref[1] === 'X';
    const code = isHex ? Number.parseInt(ref.slice(2), 16) : Number.parseInt(ref.slice(1), 10);
    return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
  });
}

/**
 * Read the text of the first `<tag>...</tag>` element inside one Atom entry,
 * entity-decoded and with all whitespace runs collapsed to single spaces.
 * Returns undefined when the element is absent.
 */
function readArxivField(entry, tag) {
  const raw = entry.match(new RegExp(`<${tag}>([\\s\\S]*?)</${tag}>`))?.[1];
  return raw === undefined ? undefined : decodeArxivText(raw).replace(/\s+/g, ' ').trim();
}

/**
 * Collect papers published on arXiv within the last 48 hours.
 *
 * For every category the arXiv export API is queried (sorted by last update,
 * newest first) and the Atom response is scanned entry by entry. Entries are
 * kept only when they have a title, an id and a `published` timestamp that is
 * at most 48 hours old. A failure for one category is logged and does not
 * stop the remaining categories.
 *
 * @param {object} [config] - Source settings.
 * @param {string[]} [config.categories] - arXiv categories (default cs.AI, cs.CL, cs.LG).
 * @param {number} [config.maxResults] - Entries requested per category (default 50).
 * @param {number} [config.weight] - Value stored as `sourceWeight` (default 6).
 * @returns {Promise<Array<{title: string, summary: string, url: string,
 *   source: string, sourceWeight: number, date: string}>>}
 */
async function collectArxiv(config = {}) {
  const items = [];
  const categories = config.categories || ['cs.AI', 'cs.CL', 'cs.LG'];
  const cutoff = new Date(Date.now() - 48 * 60 * 60 * 1000);

  for (const cat of categories) {
    try {
      // Sorted by lastUpdatedDate to get the freshest entries; the 48-hour
      // filter below is applied to the publication date.
      const url = `https://export.arxiv.org/api/query?search_query=cat:${encodeURIComponent(cat)}&sortBy=lastUpdatedDate&sortOrder=descending&max_results=${config.maxResults || 50}`;

      console.log(` 查询: ${cat}`);
      const { data } = await fetch(url);

      // Everything before the first <entry> is feed-level metadata.
      for (const entry of data.split('<entry>').slice(1)) {
        const title = readArxivField(entry, 'title');
        const link = readArxivField(entry, 'id');
        const published = readArxivField(entry, 'published');
        if (!title || !link || !published) {
          continue;
        }

        // An unparsable timestamp yields an Invalid Date, which fails this check.
        if (!(new Date(published) >= cutoff)) {
          continue;
        }

        items.push({
          title,
          summary: (readArxivField(entry, 'summary') || '').slice(0, 200),
          url: link,
          source: 'arXiv',
          sourceWeight: config.weight || 6,
          date: published.split('T')[0],
        });
      }
    } catch (err) {
      console.error(`[arXiv] Error for ${cat}:`, err.message);
    }
  }
  return items;
}
|
||
|
||
/**
 * Collect the current Hugging Face "daily papers" list.
 *
 * Calls the public `daily_papers` endpoint and maps each element of the
 * returned JSON array to a news item. Title, summary and id are read from the
 * element itself first and from its nested `paper` object second. Elements
 * without any id are skipped, because no paper URL can be built for them.
 * Any failure is logged and yields the items gathered so far.
 *
 * @param {object} [config] - Source settings.
 * @param {number} [config.maxResults] - `limit` sent to the API (default 30).
 * @param {number} [config.weight] - Value stored as `sourceWeight` (default 8).
 * @returns {Promise<Array<{title: string, summary: string, url: string,
 *   source: string, sourceWeight: number, date: string}>>} `date` is the
 *   collection day (UTC), not the paper's publication date.
 */
async function collectHuggingFace(config = {}) {
  const items = [];
  try {
    // The endpoint name uses an underscore (daily_papers).
    const { data } = await fetch(`https://huggingface.co/api/daily_papers?limit=${config.maxResults || 30}`);
    const papers = JSON.parse(data);

    if (!Array.isArray(papers)) {
      console.error('[HuggingFace] Error: API返回的不是数组');
      return items;
    }

    const today = new Date().toISOString().split('T')[0];
    for (const entry of papers) {
      const id = entry?.paper?.id || entry?.id;
      if (!id) {
        // Would otherwise produce ".../papers/undefined".
        continue;
      }
      items.push({
        title: entry.title || entry.paper?.title || 'Untitled',
        summary: (entry.summary || entry.paper?.summary || '').slice(0, 200),
        url: `https://huggingface.co/papers/${id}`,
        source: 'Hugging Face',
        sourceWeight: config.weight || 8,
        date: today,
      });
    }
  } catch (err) {
    console.error('[HuggingFace] Error:', err.message);
  }
  return items;
}
|
||
|
||
/**
 * Turn an HTML fragment into plain text: drop tags, decode character
 * references and common named entities in one pass, collapse whitespace.
 */
function cleanTrendingText(html) {
  const named = new Map([
    ['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"], ['nbsp', '\u00a0'],
  ]);
  return html
    .replace(/<[^>]+>/g, '')
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole, ref) => {
      if (ref[0] !== '#') {
        return named.get(ref.toLowerCase()) ?? whole;
      }
      const isHex = ref[1] === 'x' || ref[1] === 'X';
      const code = isHex ? Number.parseInt(ref.slice(2), 16) : Number.parseInt(ref.slice(1), 10);
      return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    })
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Collect today's repositories from the GitHub Trending page by scraping its
 * HTML (`<article class="Box-row">` blocks).
 *
 * Any failure is logged and yields the items gathered so far.
 *
 * @param {object} [config] - Source settings.
 * @param {number} [config.maxResults] - Maximum rows to read (default 30).
 * @param {number} [config.weight] - Value stored as `sourceWeight` (default 7).
 * @returns {Promise<Array<{title: string, summary: string, url: string,
 *   source: string, sourceWeight: number, stars: number, date: string}>>}
 *   `title` is the "owner/repo" path; `stars` is the "stars today" count.
 */
async function collectGitHub(config = {}) {
  const items = [];
  try {
    const { data } = await fetch('https://github.com/trending?since=daily');
    const rows = data.match(/<article class="Box-row">[\s\S]*?<\/article>/g) || [];
    const today = new Date().toISOString().split('T')[0];

    for (const row of rows.slice(0, config.maxResults || 30)) {
      // Prefer the link inside the row's heading. The first anchor of a row is
      // not guaranteed to be the repository link (NOTE(review): on the live
      // page a star/login link appears to precede it - confirm). Fall back to
      // the first anchor when the row has no heading.
      const href =
        row.match(/<h2[^>]*>[\s\S]*?href="\/([^"]+)"/)?.[1] ??
        row.match(/href="\/([^"]+)"/)?.[1];
      if (!href) {
        continue;
      }

      const descHtml = row.match(/<p class="col-9[^"]*">([\s\S]*?)<\/p>/)?.[1];
      const stars = row.match(/([\d,]+)\s*stars today/)?.[1] || '0';

      items.push({
        title: href,
        summary: cleanTrendingText(descHtml || '').slice(0, 200),
        url: `https://github.com/${href}`,
        source: 'GitHub Trending',
        sourceWeight: config.weight || 7,
        stars: Number.parseInt(stars.replace(/,/g, ''), 10) || 0,
        date: today,
      });
    }
  } catch (err) {
    console.error('[GitHub] Error:', err.message);
  }
  return items;
}
|
||
|
||
// ============ 处理模块 ============
|
||
|
||
/**
 * Score an item by how many topic keywords occur in its title or summary.
 *
 * Matching is a case-insensitive substring test. Every matching keyword adds
 * 2 points; a keyword listed under several topics counts once per topic.
 * Missing title/summary fields are treated as empty text, and topics without
 * a `keywords` array (or empty keyword strings) contribute nothing.
 *
 * @param {{title?: string, summary?: string}} item - Item to score.
 * @param {Array<{keywords?: string[]}>} [topics] - Configured topics.
 * @returns {number} Non-negative even score.
 */
function calculateKeywordScore(item, topics) {
  // `?? ''` keeps a missing field from being searched as the text "undefined".
  const text = `${item.title ?? ''} ${item.summary ?? ''}`.toLowerCase();
  let score = 0;
  for (const topic of topics ?? []) {
    for (const keyword of topic.keywords ?? []) {
      // An empty keyword is a substring of everything; ignore it.
      if (keyword && text.includes(keyword.toLowerCase())) {
        score += 2;
      }
    }
  }
  return score;
}
|
||
|
||
/**
 * Drop items whose URL has been seen before.
 *
 * The comparison key is the lower-cased URL with one trailing slash removed.
 * Every key that passes is added to `seenUrls`, so the set is mutated and also
 * filters duplicates within `items` itself.
 *
 * @param {Array<{url: string}>} items - Candidate items, in priority order.
 * @param {Set<string>} seenUrls - Keys already seen; updated in place.
 * @returns {Array<{url: string}>} The first occurrence of each unseen URL.
 */
function deduplicate(items, seenUrls) {
  const fresh = [];
  for (const entry of items) {
    const key = entry.url.toLowerCase().replace(/\/$/, '');
    if (!seenUrls.has(key)) {
      seenUrls.add(key);
      fresh.push(entry);
    }
  }
  return fresh;
}
|
||
|
||
/**
 * Rank items by descending `finalScore`.
 *
 * finalScore = sourceWeight + keyword score + 2 bonus points when the item
 * has more than 100 stars. Items are copied (the input objects and array are
 * not modified); ties keep their input order.
 *
 * @param {Array<object>} items - Items carrying `sourceWeight` and optionally `stars`.
 * @param {Array<object>} topics - Topics passed on to calculateKeywordScore.
 * @returns {Array<object>} New array of item copies with `finalScore` added.
 */
function sortItems(items, topics) {
  const scored = [];
  for (const entry of items) {
    const starBonus = entry.stars > 100 ? 2 : 0;
    const finalScore = entry.sourceWeight + calculateKeywordScore(entry, topics) + starBonus;
    scored.push({ ...entry, finalScore });
  }
  scored.sort((left, right) => right.finalScore - left.finalScore);
  return scored;
}
|
||
|
||
// ============ 输出模块 ============
|
||
|
||
/**
 * Render the Chinese daily brief as Markdown.
 *
 * Layout: header, the first `topCount` items as a numbered list, then one
 * section per topic listing up to 10 keyword-matching items that are not
 * already in the top list. Every listed title is replaced by the result of
 * summarizeToChinese; a `[done/total]` progress marker is written to stdout
 * before each translation.
 *
 * @param {Array<{title: string, summary: string, url: string}>} items - Ranked items.
 * @param {number} topCount - Number of leading items shown in the top list.
 * @param {Array<{name: string, keywords: string[]}>} topics - Topic sections.
 * @param {string} date - Date label used in the heading.
 * @returns {string} Markdown document.
 */
function generateMarkdownZH(items, topCount, topics, date) {
  const PER_TOPIC_LIMIT = 10;
  const topItems = items.slice(0, topCount);
  const topSet = new Set(topItems);

  let md = `# AI Daily Brief - ${date}\n\n`;
  md += `> 采集时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}\n`;
  md += `> 总条目: ${items.length}\n\n`;

  md += `## 🔥 Top ${topCount} 重要消息\n\n`;
  console.log(` 翻译 Top ${topCount}...`);
  topItems.forEach((item, index) => {
    process.stdout.write(` [${index + 1}/${topItems.length}] `);
    const titleZH = summarizeToChinese(item.title, item.summary);
    md += `${index + 1}. ${titleZH}\n ${item.url}\n\n`;
  });
  if (topItems.length > 0) {
    // Terminate the progress line so later log output starts on a fresh line.
    process.stdout.write('\n');
  }

  md += `## 📂 分类汇总\n\n`;

  // Resolve each topic's items once, already capped, so the progress total
  // equals the number of titles that are actually translated.
  const sections = [];
  for (const topic of topics) {
    const keywords = topic.keywords.map((keyword) => keyword.toLowerCase());
    const listed = items
      .filter((item) => {
        const text = `${item.title} ${item.summary}`.toLowerCase();
        return keywords.some((keyword) => text.includes(keyword)) && !topSet.has(item);
      })
      .slice(0, PER_TOPIC_LIMIT);
    if (listed.length > 0) {
      sections.push({ name: topic.name, listed });
    }
  }
  const totalCategoryItems = sections.reduce((sum, section) => sum + section.listed.length, 0);

  let processedCount = 0;
  for (const section of sections) {
    md += `### ${section.name}\n\n`;
    for (const item of section.listed) {
      processedCount++;
      process.stdout.write(` [${processedCount}/${totalCategoryItems}] `);
      const titleZH = summarizeToChinese(item.title, item.summary);
      md += `- ${titleZH}\n ${item.url}\n`;
    }
    md += '\n';
  }
  if (processedCount > 0) {
    process.stdout.write('\n');
  }

  md += `---\n*Generated by AINewsCollector*\n`;

  return md;
}
|
||
|
||
/**
 * Render the English daily brief as Markdown.
 *
 * Same layout as the Chinese brief but with the original titles: header, the
 * first `topCount` items as a numbered list, then one section per topic with
 * up to 10 keyword-matching items that are not in the top list. Known Chinese
 * topic names are replaced by their English label; others are used as-is.
 *
 * @param {Array<{title: string, summary: string, url: string}>} items - Ranked items.
 * @param {number} topCount - Number of leading items shown in the top list.
 * @param {Array<{name: string, keywords: string[]}>} topics - Topic sections.
 * @param {string} date - Date label used in the heading.
 * @returns {string} Markdown document.
 */
function generateMarkdownEN(items, topCount, topics, date) {
  // English labels for the topic names used in the configuration.
  const englishTopicNames = {
    'AI 编程工具 / Code Agent': 'AI Coding Tools / Code Agent',
    'Agent 框架': 'Agent Frameworks',
    'AI 基础设施 / 推理优化': 'AI Infrastructure / Inference Optimization'
  };

  const highlights = items.slice(0, topCount);
  const collectedAt = new Date().toLocaleString('en-US', { timeZone: 'Asia/Shanghai' });

  const chunks = [
    `# AI Daily Brief - ${date}\n\n`,
    `> Collected at: ${collectedAt}\n`,
    `> Total items: ${items.length}\n\n`,
    `## 🔥 Top ${topCount} Highlights\n\n`,
  ];

  highlights.forEach((entry, position) => {
    chunks.push(`${position + 1}. ${entry.title}\n ${entry.url}\n\n`);
  });

  chunks.push(`## 📂 Categories\n\n`);

  for (const topic of topics) {
    const matched = items.filter((entry) => {
      const haystack = `${entry.title} ${entry.summary}`.toLowerCase();
      const hit = topic.keywords.some((word) => haystack.includes(word.toLowerCase()));
      return hit && !highlights.includes(entry);
    });
    if (matched.length === 0) {
      continue;
    }

    const heading = englishTopicNames[topic.name] || topic.name;
    chunks.push(`### ${heading}\n\n`);
    for (const entry of matched.slice(0, 10)) {
      chunks.push(`- ${entry.title}\n ${entry.url}\n`);
    }
    chunks.push('\n');
  }

  chunks.push(`---\n*Generated by AINewsCollector*\n`);

  return chunks.join('');
}
|
||
|
||
/**
 * Build both language versions of today's brief.
 *
 * The date label is today's calendar date in the Asia/Shanghai time zone,
 * formatted as YYYY-MM-DD. It is assembled from the formatter's individual
 * parts, so the result does not depend on how any locale orders or separates
 * the date fields.
 *
 * @param {Array<object>} items - Ranked items.
 * @param {number} topCount - Number of leading items shown in the top list.
 * @param {Array<object>} topics - Topic sections.
 * @returns {{md_zh: string, md_en: string, date: string}} Chinese Markdown,
 *   English Markdown and the date label used in both.
 */
function generateMarkdown(items, topCount, topics) {
  const parts = new Intl.DateTimeFormat('en-US', {
    year: 'numeric',
    month: '2-digit',
    day: '2-digit',
    timeZone: 'Asia/Shanghai',
  }).formatToParts(new Date());
  const part = (type) => parts.find((p) => p.type === type).value;
  const date = `${part('year')}-${part('month')}-${part('day')}`;

  console.log('🌐 生成中文版(翻译中)...');
  const md_zh = generateMarkdownZH(items, topCount, topics, date);

  console.log('🌐 生成英文版...');
  const md_en = generateMarkdownEN(items, topCount, topics, date);

  return { md_zh, md_en, date };
}
|
||
|
||
// ============ 主流程 ============
|
||
|
||
/**
 * Run one full collection cycle.
 *
 * Steps: read the configuration, load the seen-URL cache, collect from arXiv,
 * Hugging Face and GitHub Trending, drop already-seen URLs, rank the rest,
 * write the Chinese and English briefs into DAILY_DIR and store the updated
 * cache (at most the 5000 most recently added URLs).
 *
 * @returns {Promise<{success: boolean, outputPathZH: string, outputPathEN: string,
 *   date: string, itemCount: number, content_zh: string, content_en: string}>}
 * @throws When the configuration cannot be read or parsed, or an output file
 *   cannot be written.
 */
async function main() {
  console.log('🚀 AINewsCollector 开始运行...\n');

  const config = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));

  let seenUrls = new Set();
  if (fs.existsSync(CACHE_PATH)) {
    try {
      const cache = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
      seenUrls = new Set(Array.isArray(cache.urls) ? cache.urls : []);
    } catch (err) {
      // A damaged cache only costs deduplication against earlier runs; it
      // should not abort the collection. The file is rewritten at the end.
      console.warn(`[cache] Ignoring unreadable ${CACHE_PATH}: ${err.message}`);
    }
  }

  let allItems = [];

  console.log('📡 采集 arXiv...');
  const arxivItems = await collectArxiv(config.sources.arxiv);
  allItems.push(...arxivItems);
  console.log(` 获取 ${arxivItems.length} 条\n`);

  console.log('📡 采集 Hugging Face Papers...');
  const hfItems = await collectHuggingFace(config.sources.huggingface);
  allItems.push(...hfItems);
  console.log(` 获取 ${hfItems.length} 条\n`);

  console.log('📡 采集 GitHub Trending...');
  const ghItems = await collectGitHub(config.sources.github);
  allItems.push(...ghItems);
  console.log(` 获取 ${ghItems.length} 条\n`);

  const originalCount = allItems.length;
  // deduplicate() also records every kept URL in seenUrls.
  allItems = deduplicate(allItems, seenUrls);
  console.log(`🔄 去重: ${originalCount} → ${allItems.length}\n`);

  allItems = sortItems(allItems, config.topics);

  const { md_zh, md_en, date } = generateMarkdown(allItems, config.output.topCount, config.topics);

  fs.mkdirSync(DAILY_DIR, { recursive: true });

  // Chinese brief
  const outputPathZH = path.join(DAILY_DIR, `${date}_zh.md`);
  fs.writeFileSync(outputPathZH, md_zh);
  console.log(`📝 中文简报: ${outputPathZH}\n`);

  // English brief
  const outputPathEN = path.join(DAILY_DIR, `${date}_en.md`);
  fs.writeFileSync(outputPathEN, md_en);
  console.log(`📝 英文简报: ${outputPathEN}\n`);

  // The cache directory may not exist yet on a first run; without this the
  // write below fails after all the work has been done.
  fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
  const cacheData = { lastUpdate: new Date().toISOString(), urls: Array.from(seenUrls).slice(-5000) };
  fs.writeFileSync(CACHE_PATH, JSON.stringify(cacheData, null, 2));

  console.log('✅ 采集完成!\n');

  return {
    success: true,
    outputPathZH,
    outputPathEN,
    date,
    itemCount: allItems.length,
    content_zh: md_zh,
    content_en: md_en,
  };
}
|
||
|
||
// Run the collector when this file is executed directly; when it is loaded
// with require(), only export main() and let the caller decide when to run it.
const invokedDirectly = require.main === module;
if (invokedDirectly) {
  main().catch((err) => {
    console.error('❌ Error:', err);
    process.exit(1);
  });
}

module.exports = { main };
|