bojunc 37bd458c0b feat: 使用智谱GLM-4-Flash生成中文标题
- 替换 Google Translate 为智谱 GLM-4-Flash API
- LLM 总结论文/项目核心内容生成中文标题
- 标题简洁有力,突出技术亮点
- 简化输出格式:标题 + 链接(无摘要)
- 添加翻译进度显示
2026-02-28 00:09:07 +08:00

416 lines
13 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* AINewsCollector - AI 新闻采集核心脚本
*/
const fs = require('fs');
const path = require('path');
const { execFileSync, execSync } = require('child_process');
// File locations, resolved relative to this script.
const CONFIG_PATH = path.join(__dirname, '../../config.json');
const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json');
const DAILY_DIR = path.join(__dirname, '../../daily');
// Proxy handed to curl; taken from the environment, otherwise a local default.
const PROXY_URL = process.env.HTTP_PROXY || process.env.HTTPS_PROXY || 'http://127.0.0.1:7890';
// Zhipu AI chat-completions API settings.
const ZHIPU_API = 'https://open.bigmodel.cn/api/paas/v4/chat/completions';
// SECURITY: the API key must come from the ZHIPU_KEY environment variable only.
// A credential must never be committed to source control; any key that was
// previously embedded here should be treated as leaked and revoked.
const ZHIPU_KEY = process.env.ZHIPU_KEY || '';
const ZHIPU_MODEL = 'glm-4-flash';
// In-memory cache of generated headlines, keyed by `${title}|||${summary}`.
const translateCache = new Map();
/**
 * Ask the Zhipu chat-completions API to condense a paper/project into a
 * one-line Chinese headline.
 *
 * The request is sent by spawning curl with an argument array (no shell), and
 * the JSON body is piped through stdin, so nothing needs quoting and no
 * temporary file is left behind.
 *
 * @param {string} title - Original title; also the fallback return value.
 * @param {string} summary - Optional abstract/description used as context.
 * @returns {string} The generated headline (first line only, capped at 60
 *   characters), or `title` when no key is configured or the request fails.
 */
function summarizeToChinese(title, summary) {
  const cacheKey = `${title}|||${summary}`;
  if (translateCache.has(cacheKey)) {
    return translateCache.get(cacheKey);
  }
  // Without a key the request can only fail; skip the network round trip.
  if (!ZHIPU_KEY) {
    return title;
  }
  try {
    const prompt = `请将以下 AI 论文/项目信息总结为一句话中文标题30字以内突出核心贡献或创新点
标题:${title}
摘要:${summary || '无'}
要求:
1. 只输出翻译后的标题,不要其他内容
2. 标题要简洁有力,突出技术亮点
3. 使用专业术语的中文译名`;
    const requestBody = JSON.stringify({
      model: ZHIPU_MODEL,
      messages: [
        { role: 'system', content: '你是一个AI技术专家擅长总结论文和项目核心内容。' },
        { role: 'user', content: prompt }
      ],
      max_tokens: 100,
      temperature: 0.3
    });

    const curlArgs = ['-s'];
    if (PROXY_URL) {
      curlArgs.push('--proxy', PROXY_URL);
    }
    curlArgs.push(
      '-X', 'POST',
      '-H', 'Content-Type: application/json',
      '-H', `Authorization: Bearer ${ZHIPU_KEY}`,
      // "@-" makes curl read the request body from stdin.
      '--data-binary', '@-',
      '--url', ZHIPU_API
    );
    const result = execFileSync('curl', curlArgs, {
      input: requestBody,
      encoding: 'utf8',
      timeout: 30000,
      maxBuffer: 1024 * 1024
    });

    const json = JSON.parse(result);
    let translated = json.choices?.[0]?.message?.content?.trim() || title;
    // Keep only the first line in case the model appended explanations.
    translated = translated.split('\n')[0].trim() || title;
    if (translated.length > 60) {
      translated = translated.slice(0, 57) + '...';
    }
    translateCache.set(cacheKey, translated);
    return translated;
  } catch (err) {
    // Do not print err.message: for a failed child process it contains the
    // full command line, including the Authorization header.
    const reason = err.status != null ? `curl exit ${err.status}` : (err.code || err.name);
    console.error(`[Zhipu] Title generation failed, using original title (${reason})`);
    return title;
  }
}
/**
 * Download a URL synchronously by spawning curl (used instead of an HTTP
 * library so the configured proxy is honoured).
 *
 * curl is started with an argument array rather than a shell command string,
 * so characters such as `$`, backticks or quotes in `url` are passed through
 * literally. `--fail` makes HTTP 4xx/5xx responses raise an error instead of
 * being returned as if they were successful.
 *
 * @param {string} url - Absolute URL to download.
 * @returns {{data: string, status: number}} Response body and a fixed status
 *   of 200 (only successful transfers return).
 * @throws {Error} "Fetch failed: …" when curl exits non-zero.
 */
function fetch(url) {
  const curlArgs = [
    '-sS',
    '--fail',
    '-L',
    '--max-time', '30',
    '-H', 'User-Agent: Mozilla/5.0 (compatible; AINewsCollector/1.0)'
  ];
  if (PROXY_URL) {
    curlArgs.push('--proxy', PROXY_URL);
  }
  // --url keeps a value that starts with "-" from being read as an option.
  curlArgs.push('--url', String(url));

  try {
    const data = execFileSync('curl', curlArgs, {
      encoding: 'utf8',
      maxBuffer: 10 * 1024 * 1024
    });
    return { data, status: 200 };
  } catch (err) {
    throw new Error(`Fetch failed: ${err.message}`, { cause: err });
  }
}
// ============ Collectors ============
/**
 * Collect recently published papers from the arXiv Atom API.
 *
 * For every category the newest entries (sorted by last update) are fetched
 * and then filtered locally to those whose `<published>` timestamp lies within
 * the last 48 hours.
 *
 * @param {{categories?: string[], maxResults?: number, weight?: number}} [config]
 *   Source settings; every field falls back to a default.
 * @returns {Promise<Array<object>>} Items with title, summary (max 200 chars),
 *   url, source, sourceWeight and date (YYYY-MM-DD). A failing category is
 *   logged and skipped.
 */
async function collectArxiv(config) {
  const settings = config || {};
  const categories = settings.categories || ['cs.AI', 'cs.CL', 'cs.LG'];
  const maxResults = settings.maxResults || 50;
  const sourceWeight = settings.weight || 6;
  // Only papers published after this instant are kept.
  const twoDaysAgo = new Date(Date.now() - 48 * 60 * 60 * 1000);

  // Atom text is XML-escaped; "&amp;" is decoded last so that "&amp;lt;" is
  // not decoded twice.
  const decodeXmlEntities = (text) => text
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&amp;/g, '&');
  // Raw inner text of the first <tag>…</tag> in an entry, or undefined.
  const extractTag = (entry, tag) =>
    entry.match(new RegExp(`<${tag}>([\\s\\S]*?)</${tag}>`))?.[1];
  // Long titles/abstracts are wrapped with newline + indentation in the feed;
  // collapse that to single spaces.
  const cleanText = (raw) =>
    (raw === undefined ? undefined : decodeXmlEntities(raw.replace(/\s+/g, ' ').trim()));

  const items = [];
  for (const cat of categories) {
    try {
      const url = `https://export.arxiv.org/api/query?search_query=cat:${encodeURIComponent(cat)}&sortBy=lastUpdatedDate&sortOrder=descending&max_results=${maxResults}`;
      console.log(` 查询: ${cat}`);
      const { data } = await fetch(url);
      // Everything before the first <entry> is feed-level metadata.
      const entries = data.split('<entry>').slice(1);
      for (const entry of entries) {
        const title = cleanText(extractTag(entry, 'title'));
        const summary = cleanText(extractTag(entry, 'summary'));
        const link = extractTag(entry, 'id')?.trim();
        const published = extractTag(entry, 'published')?.trim();
        if (!title || !link || !published) {
          continue;
        }
        // An unparsable date yields an Invalid Date, which fails the comparison.
        if (!(new Date(published) >= twoDaysAgo)) {
          continue;
        }
        items.push({
          title,
          summary: (summary || '').slice(0, 200),
          url: link,
          source: 'arXiv',
          sourceWeight,
          date: published.split('T')[0]
        });
      }
    } catch (err) {
      console.error(`[arXiv] Error for ${cat}:`, err.message);
    }
  }
  return items;
}
/**
 * Collect the current Hugging Face "daily papers" list.
 *
 * Uses the `daily_papers` endpoint (underscore form), which the original
 * author notes needs no authentication.
 *
 * @param {{maxResults?: number, weight?: number}} [config] Source settings.
 * @returns {Promise<Array<object>>} Items with title, summary (max 200 chars),
 *   url, source, sourceWeight and today's date. Entries without a paper id are
 *   skipped because no valid link can be built for them. Errors are logged and
 *   yield whatever was collected so far.
 */
async function collectHuggingFace(config) {
  const items = [];
  const { maxResults, weight } = config || {};
  try {
    const { data } = await fetch(`https://huggingface.co/api/daily_papers?limit=${maxResults || 30}`);
    const papers = JSON.parse(data);
    if (!Array.isArray(papers)) {
      console.error('[HuggingFace] Error: API返回的不是数组');
      return items;
    }
    const today = new Date().toISOString().split('T')[0];
    for (const paper of papers) {
      // The id may sit on the nested `paper` object or on the entry itself.
      const id = paper?.paper?.id || paper?.id;
      if (!id) {
        continue;
      }
      items.push({
        title: paper.title || paper.paper?.title || 'Untitled',
        summary: (paper.summary || paper.paper?.summary || '').slice(0, 200),
        url: `https://huggingface.co/papers/${id}`,
        source: 'Hugging Face',
        sourceWeight: weight || 8,
        date: today
      });
    }
  } catch (err) {
    console.error('[HuggingFace] Error:', err.message);
  }
  return items;
}
/**
 * Scrape the GitHub "trending today" page.
 *
 * Each `<article class="Box-row">` is one repository. The repository path is
 * taken from the link inside the `<h2>` heading; the first `href` in the
 * article is not reliable because action buttons (login/star, sponsor) can
 * precede the heading. Any candidate must look like `owner/repo`.
 *
 * @param {{maxResults?: number, weight?: number}} [config] Source settings.
 * @returns {Promise<Array<object>>} Items with title (`owner/repo`), summary
 *   (max 200 chars), url, source, sourceWeight, stars (gained today) and
 *   today's date. Errors are logged and yield an empty or partial list.
 */
async function collectGitHub(config) {
  const items = [];
  const REPO_PATH = /^[\w.-]+\/[\w.-]+$/;
  try {
    const { data } = await fetch('https://github.com/trending?since=daily');
    const repoMatches = data.match(/<article class="Box-row">[\s\S]*?<\/article>/g) || [];
    const today = new Date().toISOString().split('T')[0];
    for (const repo of repoMatches.slice(0, config?.maxResults || 30)) {
      const headingHref = repo.match(/<h2[^>]*>[\s\S]*?<a\b[^>]*?\bhref="\/([^"]+)"/)?.[1];
      const allHrefs = [...repo.matchAll(/href="\/([^"]+)"/g)].map((m) => m[1]);
      // Prefer the heading link, then any other link shaped like owner/repo.
      const href = [headingHref, ...allHrefs].find((h) => h && REPO_PATH.test(h));
      if (!href) {
        continue;
      }
      const rawDesc = repo.match(/<p class="col-9[^"]*">([\s\S]*?)<\/p>/)?.[1] || '';
      // Strip nested markup first, then tidy whitespace left behind by it.
      const desc = rawDesc.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
      const stars = repo.match(/([\d,]+)\s*stars today/)?.[1] || '0';
      items.push({
        title: href,
        summary: desc.slice(0, 200),
        url: `https://github.com/${href}`,
        source: 'GitHub Trending',
        sourceWeight: config?.weight || 7,
        stars: Number.parseInt(stars.replace(/,/g, ''), 10) || 0,
        date: today
      });
    }
  } catch (err) {
    console.error('[GitHub] Error:', err.message);
  }
  return items;
}
// ============ Processing ============
/**
 * Score an item by keyword hits: every topic keyword that occurs
 * (case-insensitively) in "title summary" is worth 2 points.
 *
 * @param {{title: string, summary: string}} item - Item to score.
 * @param {Iterable<{keywords: Iterable<string>}>} topics - Configured topics.
 * @returns {number} Twice the number of matching keywords across all topics.
 */
function calculateKeywordScore(item, topics) {
  const haystack = `${item.title} ${item.summary}`.toLowerCase();
  let hits = 0;
  for (const { keywords } of topics) {
    for (const word of keywords) {
      if (haystack.includes(word.toLowerCase())) {
        hits += 1;
      }
    }
  }
  return hits * 2;
}
/**
 * Drop items whose URL has been seen before.
 *
 * URLs are compared lower-cased and without one trailing slash. Every URL
 * that is kept is also recorded in `seenUrls`, so the set is mutated.
 *
 * @param {Array<{url: string}>} items - Candidate items, in order.
 * @param {Set<string>} seenUrls - Normalized URLs already known; updated in place.
 * @returns {Array<object>} The items whose URL was new, in original order.
 */
function deduplicate(items, seenUrls) {
  const fresh = [];
  for (const entry of items) {
    const normalized = entry.url.toLowerCase().replace(/\/$/, '');
    if (!seenUrls.has(normalized)) {
      seenUrls.add(normalized);
      fresh.push(entry);
    }
  }
  return fresh;
}
/**
 * Rank items by a combined score, highest first.
 *
 * finalScore = sourceWeight + keyword score + 2 bonus points when the item
 * has more than 100 stars. The input objects are copied, not modified.
 *
 * @param {Array<object>} items - Items carrying `sourceWeight` and optionally `stars`.
 * @param {Array<object>} topics - Topics forwarded to calculateKeywordScore.
 * @returns {Array<object>} New array of item copies with `finalScore`, sorted descending.
 */
function sortItems(items, topics) {
  const scored = items.map((item) => {
    const starBonus = item.stars > 100 ? 2 : 0;
    const finalScore = item.sourceWeight + calculateKeywordScore(item, topics) + starBonus;
    return { ...item, finalScore };
  });
  scored.sort((left, right) => right.finalScore - left.finalScore);
  return scored;
}
// ============ Output ============
/**
 * Build the Chinese edition of the daily brief.
 *
 * The first `topCount` items form the "Top" list; the remaining items are
 * grouped under every topic whose keywords they match (case-insensitive),
 * with at most 10 entries per topic. Each printed title is produced by
 * summarizeToChinese, and a "[done/total]" progress marker is written to
 * stdout before each call.
 *
 * @param {Array<object>} items - Ranked items (title, summary, url).
 * @param {number} topCount - Number of leading items shown in the Top list.
 * @param {Array<{name: string, keywords: string[]}>} topics - Topic definitions.
 * @param {string} date - Date label used in the heading.
 * @returns {string} Markdown document.
 */
function generateMarkdownZH(items, topCount, topics, date) {
  const top10 = items.slice(0, topCount);
  // Set lookup instead of Array.includes inside the per-topic filter.
  const highlighted = new Set(top10);

  let md = `# AI Daily Brief - ${date}\n\n`;
  md += `> 采集时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}\n`;
  md += `> 总条目: ${items.length}\n\n`;
  md += `## 🔥 Top ${topCount} 重要消息\n\n`;

  console.log(` 翻译 Top ${topCount}...`);
  top10.forEach((item, index) => {
    process.stdout.write(` [${index + 1}/${top10.length}] `);
    const titleZH = summarizeToChinese(item.title, item.summary);
    md += `${index + 1}. ${titleZH}\n ${item.url}\n\n`;
  });

  md += `## 📂 分类汇总\n\n`;

  // Resolve each topic's entries once, already capped at 10, so the progress
  // total below counts exactly the titles that will be translated.
  const sections = topics.map((topic) => {
    const keywords = topic.keywords.map((k) => k.toLowerCase());
    const entries = items
      .filter((item) => {
        if (highlighted.has(item)) return false;
        const text = `${item.title} ${item.summary}`.toLowerCase();
        return keywords.some((k) => text.includes(k));
      })
      .slice(0, 10);
    return { name: topic.name, entries };
  });
  const totalCategoryItems = sections.reduce((sum, section) => sum + section.entries.length, 0);

  let processedCount = 0;
  for (const { name, entries } of sections) {
    if (entries.length === 0) {
      continue;
    }
    md += `### ${name}\n\n`;
    for (const item of entries) {
      processedCount++;
      process.stdout.write(` [${processedCount}/${totalCategoryItems}] `);
      const titleZH = summarizeToChinese(item.title, item.summary);
      md += `- ${titleZH}\n ${item.url}\n`;
    }
    md += '\n';
  }

  md += `---\n*Generated by AINewsCollector*\n`;
  return md;
}
/**
 * Build the English edition of the daily brief (original titles, no
 * translation calls).
 *
 * Layout mirrors the Chinese edition: a Top list of the first `topCount`
 * items, then one section per topic containing up to 10 matching items that
 * are not already in the Top list. Known Chinese topic names are mapped to
 * English headings; unknown names are used unchanged.
 *
 * @param {Array<object>} items - Ranked items (title, summary, url).
 * @param {number} topCount - Number of leading items shown in the Top list.
 * @param {Array<{name: string, keywords: string[]}>} topics - Topic definitions.
 * @param {string} date - Date label used in the heading.
 * @returns {string} Markdown document.
 */
function generateMarkdownEN(items, topCount, topics, date) {
  // English headings for the topic names used in the configuration.
  const topicNamesEN = {
    'AI 编程工具 / Code Agent': 'AI Coding Tools / Code Agent',
    'Agent 框架': 'Agent Frameworks',
    'AI 基础设施 / 推理优化': 'AI Infrastructure / Inference Optimization'
  };
  const top10 = items.slice(0, topCount);
  const collectedAt = new Date().toLocaleString('en-US', { timeZone: 'Asia/Shanghai' });

  const chunks = [
    `# AI Daily Brief - ${date}\n\n`,
    `> Collected at: ${collectedAt}\n`,
    `> Total items: ${items.length}\n\n`,
    `## 🔥 Top ${topCount} Highlights\n\n`
  ];

  top10.forEach((entry, position) => {
    chunks.push(`${position + 1}. ${entry.title}\n ${entry.url}\n\n`);
  });

  chunks.push(`## 📂 Categories\n\n`);

  for (const topic of topics) {
    const matching = items.filter((entry) => {
      const haystack = `${entry.title} ${entry.summary}`.toLowerCase();
      const hit = topic.keywords.some((k) => haystack.includes(k.toLowerCase()));
      return hit && !top10.includes(entry);
    });
    if (matching.length === 0) {
      continue;
    }
    const heading = topicNamesEN[topic.name] || topic.name;
    chunks.push(`### ${heading}\n\n`);
    for (const entry of matching.slice(0, 10)) {
      chunks.push(`- ${entry.title}\n ${entry.url}\n`);
    }
    chunks.push('\n');
  }

  chunks.push(`---\n*Generated by AINewsCollector*\n`);
  return chunks.join('');
}
/**
 * Produce both editions of the brief for today's date.
 *
 * The date label is today's date in the Asia/Shanghai time zone, formatted
 * with the zh-CN locale and with "/" separators replaced by "-".
 *
 * @param {Array<object>} items - Ranked items.
 * @param {number} topCount - Size of the Top list.
 * @param {Array<object>} topics - Topic definitions.
 * @returns {{md_zh: string, md_en: string, date: string}} Chinese markdown,
 *   English markdown and the date label.
 */
function generateMarkdown(items, topCount, topics) {
  const dateFormat = {
    year: 'numeric',
    month: '2-digit',
    day: '2-digit',
    timeZone: 'Asia/Shanghai'
  };
  const date = new Date()
    .toLocaleDateString('zh-CN', dateFormat)
    .split('/')
    .join('-');

  console.log('🌐 生成中文版(翻译中)...');
  const md_zh = generateMarkdownZH(items, topCount, topics, date);

  console.log('🌐 生成英文版...');
  const md_en = generateMarkdownEN(items, topCount, topics, date);

  return { md_zh, md_en, date };
}
// ============ Main flow ============
/**
 * Run one collection cycle: load config and the seen-URL cache, collect from
 * arXiv, Hugging Face and GitHub Trending, drop already-seen URLs, rank the
 * rest, write the Chinese and English briefs into DAILY_DIR and persist the
 * updated cache (at most the 5000 most recent URLs).
 *
 * An unreadable cache file is ignored rather than aborting the run, and the
 * cache directory is created when missing so the final write cannot fail
 * after all the work has been done.
 *
 * @returns {Promise<{success: boolean, outputPathZH: string, outputPathEN: string,
 *   date: string, itemCount: number, content_zh: string, content_en: string}>}
 * @throws If the config file is missing or invalid, or an output file cannot be written.
 */
async function main() {
  console.log('🚀 AINewsCollector 开始运行...\n');
  const config = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
  // A missing section means "use that component's defaults".
  const sources = config.sources || {};
  const topics = config.topics || [];
  const topCount = config.output?.topCount ?? 10;

  let seenUrls = new Set();
  if (fs.existsSync(CACHE_PATH)) {
    try {
      const cache = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
      if (Array.isArray(cache?.urls)) {
        seenUrls = new Set(cache.urls);
      }
    } catch (err) {
      console.error('[Cache] Ignoring unreadable cache file:', err.message);
    }
  }

  let allItems = [];
  console.log('📡 采集 arXiv...');
  const arxivItems = await collectArxiv(sources.arxiv || {});
  allItems.push(...arxivItems);
  console.log(` 获取 ${arxivItems.length}\n`);
  console.log('📡 采集 Hugging Face Papers...');
  const hfItems = await collectHuggingFace(sources.huggingface || {});
  allItems.push(...hfItems);
  console.log(` 获取 ${hfItems.length}\n`);
  console.log('📡 采集 GitHub Trending...');
  const ghItems = await collectGitHub(sources.github || {});
  allItems.push(...ghItems);
  console.log(` 获取 ${ghItems.length}\n`);

  const originalCount = allItems.length;
  // deduplicate also records every kept URL in seenUrls.
  allItems = deduplicate(allItems, seenUrls);
  console.log(`🔄 去重: ${originalCount} → ${allItems.length}\n`);
  allItems = sortItems(allItems, topics);

  const { md_zh, md_en, date } = generateMarkdown(allItems, topCount, topics);

  fs.mkdirSync(DAILY_DIR, { recursive: true });
  // Chinese edition
  const outputPathZH = path.join(DAILY_DIR, `${date}_zh.md`);
  fs.writeFileSync(outputPathZH, md_zh);
  console.log(`📝 中文简报: ${outputPathZH}\n`);
  // English edition
  const outputPathEN = path.join(DAILY_DIR, `${date}_en.md`);
  fs.writeFileSync(outputPathEN, md_en);
  console.log(`📝 英文简报: ${outputPathEN}\n`);

  // Persist the cache only after both briefs were written successfully.
  const cacheData = { lastUpdate: new Date().toISOString(), urls: Array.from(seenUrls).slice(-5000) };
  fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
  fs.writeFileSync(CACHE_PATH, JSON.stringify(cacheData, null, 2));
  console.log('✅ 采集完成!\n');

  return {
    success: true,
    outputPathZH,
    outputPathEN,
    date,
    itemCount: allItems.length,
    content_zh: md_zh,
    content_en: md_en
  };
}
// Run the collector only when this file is executed directly; when it is
// loaded with require(), just expose main().
if (require.main === module) {
  main().catch((err) => {
    console.error('❌ Error:', err);
    process.exit(1);
  });
}

module.exports = { main };