bojunc 28c290966f feat: 中文版添加自动翻译功能
- 使用 Google Translate API 翻译标题和摘要
- 添加翻译缓存避免重复翻译
- 中文版内容全部为中文(除链接外)
2026-02-27 23:43:18 +08:00

381 lines
12 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* AINewsCollector - AI 新闻采集核心脚本
*/
const fs = require('fs');
const path = require('path');
const { execSync, execFileSync } = require('child_process');
// Configuration: file locations, resolved relative to this script's directory.
const CONFIG_PATH = path.join(__dirname, '../../config.json');
const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json');
const DAILY_DIR = path.join(__dirname, '../../daily');
// Proxy configuration: HTTP_PROXY wins over HTTPS_PROXY; when neither is set
// (or both are empty strings) a local proxy on port 7890 is used, so this
// value is never empty.
const PROXY_URL = process.env.HTTP_PROXY || process.env.HTTPS_PROXY || 'http://127.0.0.1:7890';
// Translation cache: in-memory map from source text to its translation,
// filled and consulted by translateToChinese for the lifetime of the process.
const translateCache = new Map();
/**
 * Translate English text to Simplified Chinese through the public Google
 * Translate endpoint, fetched with `curl`.
 *
 * Best-effort: on any failure (curl error, timeout, malformed response) the
 * original text is returned and nothing is cached, so a later call retries.
 * Successful results are memoised in `translateCache`, keyed by the full input
 * text. Only the first 500 UTF-16 code units of the text are sent.
 *
 * @param {string} text - Text to translate.
 * @returns {string} The translation, or `text` unchanged when it is empty or
 *   the translation fails.
 */
function translateToChinese(text) {
  if (!text || text.length === 0) return text;

  if (translateCache.has(text)) {
    return translateCache.get(text);
  }

  try {
    // Truncating by code unit can cut an astral character (e.g. an emoji) in
    // half, and encodeURIComponent throws URIError on a lone surrogate, which
    // would make the whole call fall back to the untranslated text. Drop any
    // unpaired surrogate halves before encoding.
    const loneSurrogate = /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g;
    const snippet = text.slice(0, 500).replace(loneSurrogate, '');
    const url = `https://translate.googleapis.com/translate_a/single?client=gtx&sl=en&tl=zh-CN&dt=t&q=${encodeURIComponent(snippet)}`;

    // Arguments are passed as an array so that no shell ever parses the proxy
    // URL or the query string.
    const args = ['-s'];
    if (PROXY_URL) args.push('--proxy', PROXY_URL);
    args.push('-L', '--max-time', '10', '--url', url);
    const result = execFileSync('curl', args, { encoding: 'utf8', timeout: 15000 });

    // The payload's first element is a list of segments; each segment's first
    // element is a translated fragment. Concatenate the fragments in order.
    const json = JSON.parse(result);
    let translated = '';
    if (Array.isArray(json) && Array.isArray(json[0])) {
      for (const part of json[0]) {
        if (part && part[0]) translated += part[0];
      }
    }

    const finalText = translated || text;
    translateCache.set(text, finalText);
    return finalText;
  } catch (err) {
    // Translation is optional: report the failure and keep the source text.
    console.warn('[Translate] Error:', err.message);
    return text;
  }
}
/**
 * Synchronously GET a URL by running `curl` as a child process (used instead
 * of an in-process HTTP client so that the configured proxy is honoured).
 *
 * Within this module the declaration shadows Node's built-in global `fetch`;
 * unlike that API it is synchronous and returns the body directly.
 *
 * @param {string} url - Absolute URL to request; redirects are followed.
 * @returns {{ data: string, status: number }} Response body as UTF-8 text.
 *   `status` is always 200: success is decided by curl's exit code, and curl
 *   is not asked to fail on HTTP error statuses.
 * @throws {Error} `Fetch failed: …` when curl cannot be started, exits
 *   non-zero, or produces more than 10 MiB of output.
 */
function fetch(url) {
  // Build an argument vector rather than a shell string, so characters such
  // as quotes, `$` or backticks in the URL or proxy are passed through
  // literally instead of being interpreted by a shell.
  const args = ['-s'];
  if (PROXY_URL) args.push('--proxy', PROXY_URL);
  args.push(
    '-L',
    '--max-time', '30',
    '-H', 'User-Agent: Mozilla/5.0 (compatible; AINewsCollector/1.0)',
    '--url', url
  );

  try {
    const data = execFileSync('curl', args, {
      encoding: 'utf8',
      maxBuffer: 10 * 1024 * 1024
    });
    return { data, status: 200 };
  } catch (err) {
    throw new Error(`Fetch failed: ${err.message}`, { cause: err });
  }
}
// ============ Collectors ============

/**
 * Collect papers published on arXiv within the last 48 hours.
 *
 * For every category the arXiv Atom API is queried (newest updates first) and
 * the response is scanned with regular expressions; entries whose `published`
 * timestamp is missing, unparsable or older than 48 hours are dropped. A
 * failure for one category is logged and does not stop the others.
 *
 * @param {{ categories?: string[], maxResults?: number, weight?: number }} [config]
 *   Source settings. Defaults: categories `cs.AI`, `cs.CL`, `cs.LG`;
 *   50 results per category; weight 6.
 * @returns {Promise<Array<{title: string, summary: string, url: string,
 *   source: string, sourceWeight: number, date: string}>>} Collected items;
 *   `summary` is capped at 200 characters and `date` is the `YYYY-MM-DD` part
 *   of the published timestamp.
 */
async function collectArxiv(config) {
  const { categories, maxResults, weight } = config ?? {};
  const items = [];
  const cutoff = new Date(Date.now() - 48 * 60 * 60 * 1000);

  // Extract the first capture group and collapse all whitespace runs: feed
  // values may be hard-wrapped with indentation, which would otherwise leave
  // runs of spaces in titles and waste the summary's length budget.
  const readTag = (entry, pattern) =>
    entry.match(pattern)?.[1]?.replace(/\s+/g, ' ').trim();

  for (const cat of categories || ['cs.AI', 'cs.CL', 'cs.LG']) {
    try {
      // Sorted by last-updated date on the server; the publish-date filter is
      // applied locally below.
      const url = `https://export.arxiv.org/api/query?search_query=cat:${cat}&sortBy=lastUpdatedDate&sortOrder=descending&max_results=${maxResults || 50}`;
      console.log(` 查询: ${cat}`);
      const { data } = await fetch(url);

      // Everything before the first <entry> is feed-level metadata.
      const entries = data.split('<entry>').slice(1);
      for (const entry of entries) {
        const title = readTag(entry, /<title>([\s\S]*?)<\/title>/);
        const link = readTag(entry, /<id>([\s\S]*?)<\/id>/);
        if (!title || !link) continue;

        const published = readTag(entry, /<published>([\s\S]*?)<\/published>/);
        // An invalid date compares false, so unparsable entries are skipped.
        if (!published || !(new Date(published) >= cutoff)) continue;

        const summary = readTag(entry, /<summary>([\s\S]*?)<\/summary>/);
        items.push({
          title,
          summary: (summary || '').slice(0, 200),
          url: link,
          source: 'arXiv',
          sourceWeight: weight || 6,
          date: published.split('T')[0]
        });
      }
    } catch (err) {
      console.error(`[arXiv] Error for ${cat}:`, err.message);
    }
  }
  return items;
}
/**
 * Collect the current Hugging Face "daily papers" list.
 *
 * Each API entry may carry its fields at the top level or nested under
 * `paper`; both locations are consulted for id, title and summary. Entries
 * with no id are skipped because no paper URL can be built for them. Any
 * failure (request, JSON parsing, unexpected shape) is logged and results in
 * whatever was collected so far being returned.
 *
 * @param {{ maxResults?: number, weight?: number }} [config] - Source
 *   settings. Defaults: 30 results, weight 8.
 * @returns {Promise<Array<{title: string, summary: string, url: string,
 *   source: string, sourceWeight: number, date: string}>>} Collected items;
 *   `summary` is capped at 200 characters and `date` is today's UTC date.
 */
async function collectHuggingFace(config) {
  const items = [];
  // Titles and summaries end up inside single-line Markdown constructs, so
  // embedded newlines and whitespace runs are collapsed.
  const tidy = (value) => String(value).replace(/\s+/g, ' ').trim();

  try {
    const { maxResults, weight } = config ?? {};
    // The `daily_papers` endpoint (with underscore) is requested without any
    // authentication header.
    const { data } = await fetch(`https://huggingface.co/api/daily_papers?limit=${maxResults || 30}`);
    const papers = JSON.parse(data);
    if (!Array.isArray(papers)) {
      console.error('[HuggingFace] Error: API返回的不是数组');
      return items;
    }

    const today = new Date().toISOString().split('T')[0];
    for (const paper of papers) {
      const id = paper?.paper?.id || paper?.id;
      if (!id) continue;

      items.push({
        title: tidy(paper.title || paper.paper?.title || '') || 'Untitled',
        summary: tidy(paper.summary || paper.paper?.summary || '').slice(0, 200),
        url: `https://huggingface.co/papers/${id}`,
        source: 'Hugging Face',
        sourceWeight: weight || 8,
        date: today
      });
    }
  } catch (err) {
    console.error('[HuggingFace] Error:', err.message);
  }
  return items;
}
/**
 * Collect today's GitHub Trending repositories by scraping the trending page.
 *
 * The HTML is split into `<article class="Box-row">` rows and each row is
 * mined with regular expressions. Any failure is logged and results in
 * whatever was collected so far being returned.
 *
 * @param {{ maxResults?: number, weight?: number }} [config] - Source
 *   settings. Defaults: 30 rows, weight 7.
 * @returns {Promise<Array<{title: string, summary: string, url: string,
 *   source: string, sourceWeight: number, stars: number, date: string}>>}
 *   Collected items; `title` is the `owner/repo` path, `summary` is capped at
 *   200 characters, `stars` is the "stars today" count (0 when absent) and
 *   `date` is today's UTC date.
 */
async function collectGitHub(config) {
  const items = [];
  try {
    const { maxResults, weight } = config ?? {};
    const { data } = await fetch('https://github.com/trending?since=daily');
    const rows = data.match(/<article class="Box-row">[\s\S]*?<\/article>/g) || [];
    const today = new Date().toISOString().split('T')[0];
    const firstHref = (html) => html?.match(/href="\/([^"]+)"/)?.[1];

    for (const row of rows.slice(0, maxResults || 30)) {
      // Prefer the link inside the row's heading: a row can contain other
      // site-relative links ahead of it (NOTE(review): e.g. a sign-in/star
      // link for anonymous visitors; confirm against live markup). Fall back
      // to the first link in the row when there is no heading.
      const heading = row.match(/<h2[^>]*>[\s\S]*?<\/h2>/)?.[0];
      const href = firstHref(heading) ?? firstHref(row);
      if (!href) continue;

      // Strip inline tags first, then normalise whitespace, so markup inside
      // the description cannot leave stray gaps or line breaks.
      const desc = row
        .match(/<p class="col-9[^"]*">([\s\S]*?)<\/p>/)?.[1]
        ?.replace(/<[^>]+>/g, '')
        .replace(/\s+/g, ' ')
        .trim();
      const stars = row.match(/([\d,]+)\s*stars today/)?.[1] || '0';

      items.push({
        title: href,
        summary: (desc || '').slice(0, 200),
        url: `https://github.com/${href}`,
        source: 'GitHub Trending',
        sourceWeight: weight || 7,
        stars: Number.parseInt(stars.replace(/,/g, ''), 10) || 0,
        date: today
      });
    }
  } catch (err) {
    console.error('[GitHub] Error:', err.message);
  }
  return items;
}
// ============ Processing ============

/**
 * Score an item by how many topic keywords occur in its title and summary.
 *
 * Matching is a case-insensitive substring test. Every keyword hit is worth
 * 2 points, and a keyword listed under several topics counts once per listing.
 *
 * @param {{ title: string, summary: string }} item - Item to score.
 * @param {Array<{ keywords: string[] }>} topics - Topics with their keywords.
 * @returns {number} Total keyword score.
 */
function calculateKeywordScore(item, topics) {
  const haystack = `${item.title} ${item.summary}`.toLowerCase();
  return topics
    .flatMap((topic) => topic.keywords)
    .reduce(
      (total, keyword) => (haystack.includes(keyword.toLowerCase()) ? total + 2 : total),
      0
    );
}
/**
 * Drop items whose URL has been seen before, recording new URLs as it goes.
 *
 * URLs are compared lower-cased and without a single trailing slash. The
 * `seenUrls` set is updated in place, so it both filters against earlier runs
 * and removes duplicates within `items` itself.
 *
 * @param {Array<{ url: string }>} items - Candidate items, in priority order.
 * @param {Set<string>} seenUrls - Normalised URLs already known; mutated.
 * @returns {Array<{ url: string }>} Items not seen before, original order kept.
 */
function deduplicate(items, seenUrls) {
  const fresh = [];
  for (const candidate of items) {
    const key = candidate.url.toLowerCase().replace(/\/$/, '');
    if (!seenUrls.has(key)) {
      seenUrls.add(key);
      fresh.push(candidate);
    }
  }
  return fresh;
}
/**
 * Rank items by a combined score, highest first.
 *
 * The score is the item's source weight, plus its keyword score, plus a
 * 2-point bonus when the item has more than 100 stars. Items are copied with
 * an added `finalScore` field; the input objects are left untouched.
 *
 * @param {Array<{ sourceWeight: number, stars?: number }>} items - Items to rank.
 * @param {Array<{ keywords: string[] }>} topics - Topics passed to the keyword scorer.
 * @returns {Array<object>} Scored copies of the items in descending score order.
 */
function sortItems(items, topics) {
  const scored = items.map((item) => {
    const starBonus = item.stars > 100 ? 2 : 0;
    const finalScore = item.sourceWeight + calculateKeywordScore(item, topics) + starBonus;
    return { ...item, finalScore };
  });
  scored.sort((left, right) => right.finalScore - left.finalScore);
  return scored;
}
// ============ Output ============

/**
 * Render the Chinese edition of the daily brief as Markdown.
 *
 * The first `topCount` items form the numbered highlights list, with title
 * and summary translated (summary input capped at 200 characters, output at
 * 150 plus an ellipsis). Remaining items are then grouped under every topic
 * whose keywords occur in their title or summary (case-insensitive), at most
 * 10 per topic, with translated titles. Topics with no items are omitted.
 *
 * @param {Array<{title: string, summary: string, url: string, source: string}>} items
 *   Items in display order.
 * @param {number} topCount - Number of leading items shown as highlights.
 * @param {Array<{ name: string, keywords: string[] }>} topics - Category definitions.
 * @param {string} date - Date label used in the heading.
 * @returns {string} The Markdown document.
 */
function generateMarkdownZH(items, topCount, topics, date) {
  const headline = items.slice(0, topCount);
  const headlineSet = new Set(headline);
  const chunks = [
    `# AI Daily Brief - ${date}\n\n`,
    `> 采集时间: ${new Date().toLocaleString('zh-CN', { timeZone: 'Asia/Shanghai' })}\n`,
    `> 总条目: ${items.length}\n\n`,
    `## 🔥 Top ${topCount} 重要消息\n\n`
  ];

  headline.forEach((entry, index) => {
    const titleZH = translateToChinese(entry.title);
    const summaryZH = entry.summary ? translateToChinese(entry.summary.slice(0, 200)) : '';
    chunks.push(`${index + 1}. [${titleZH}](${entry.url}) - **${entry.source}**\n`);
    if (summaryZH) {
      const ellipsis = summaryZH.length > 150 ? '...' : '';
      chunks.push(` > ${summaryZH.slice(0, 150)}${ellipsis}\n`);
    }
    chunks.push('\n');
  });

  chunks.push(`## 📂 分类汇总\n\n`);
  for (const topic of topics) {
    // Items that mention any of the topic's keywords and are not already
    // listed among the highlights.
    const related = items.filter((entry) => {
      const haystack = `${entry.title} ${entry.summary}`.toLowerCase();
      const mentionsTopic = topic.keywords.some((keyword) => haystack.includes(keyword.toLowerCase()));
      return mentionsTopic && !headlineSet.has(entry);
    });
    if (related.length === 0) continue;

    chunks.push(`### ${topic.name}\n\n`);
    for (const entry of related.slice(0, 10)) {
      const titleZH = translateToChinese(entry.title);
      chunks.push(`- [${titleZH}](${entry.url}) - ${entry.source}\n`);
    }
    chunks.push('\n');
  }

  chunks.push(`---\n*Generated by AINewsCollector*\n`);
  return chunks.join('');
}
/**
 * Render the English edition of the daily brief as Markdown.
 *
 * The first `topCount` items form the numbered highlights list (summary
 * capped at 150 characters plus an ellipsis). Remaining items are grouped
 * under every topic whose keywords occur in their title or summary
 * (case-insensitive), at most 10 per topic. Known Chinese topic names are
 * shown with their English equivalents; other names are used as-is. Topics
 * with no items are omitted.
 *
 * @param {Array<{title: string, summary: string, url: string, source: string}>} items
 *   Items in display order.
 * @param {number} topCount - Number of leading items shown as highlights.
 * @param {Array<{ name: string, keywords: string[] }>} topics - Category definitions.
 * @param {string} date - Date label used in the heading.
 * @returns {string} The Markdown document.
 */
function generateMarkdownEN(items, topCount, topics, date) {
  // English display names for the configured (Chinese) topic names.
  const topicNamesEN = {
    'AI 编程工具 / Code Agent': 'AI Coding Tools / Code Agent',
    'Agent 框架': 'Agent Frameworks',
    'AI 基础设施 / 推理优化': 'AI Infrastructure / Inference Optimization'
  };

  const highlights = items.slice(0, topCount);
  const highlighted = new Set(highlights);
  const mentions = (entry, topic) => {
    const haystack = `${entry.title} ${entry.summary}`.toLowerCase();
    return topic.keywords.some((keyword) => haystack.includes(keyword.toLowerCase()));
  };

  const out = [];
  out.push(`# AI Daily Brief - ${date}\n\n`);
  out.push(`> Collected at: ${new Date().toLocaleString('en-US', { timeZone: 'Asia/Shanghai' })}\n`);
  out.push(`> Total items: ${items.length}\n\n`);
  out.push(`## 🔥 Top ${topCount} Highlights\n\n`);

  let rank = 0;
  for (const entry of highlights) {
    rank += 1;
    out.push(`${rank}. [${entry.title}](${entry.url}) - **${entry.source}**\n`);
    if (entry.summary) {
      const tail = entry.summary.length > 150 ? '...' : '';
      out.push(` > ${entry.summary.slice(0, 150)}${tail}\n`);
    }
    out.push('\n');
  }

  out.push(`## 📂 Categories\n\n`);
  topics.forEach((topic) => {
    const related = items.filter((entry) => mentions(entry, topic) && !highlighted.has(entry));
    if (related.length === 0) return;

    const heading = topicNamesEN[topic.name] || topic.name;
    out.push(`### ${heading}\n\n`);
    related.slice(0, 10).forEach((entry) => {
      out.push(`- [${entry.title}](${entry.url}) - ${entry.source}\n`);
    });
    out.push('\n');
  });

  out.push(`---\n*Generated by AINewsCollector*\n`);
  return out.join('');
}
/**
 * Render both editions of the daily brief for today's date.
 *
 * The date label is today's date in the Asia/Shanghai time zone, formatted
 * with the zh-CN locale and with slashes replaced by dashes. The Chinese
 * edition is produced first, then the English one.
 *
 * @param {Array<object>} items - Items in display order.
 * @param {number} topCount - Number of leading items shown as highlights.
 * @param {Array<{ name: string, keywords: string[] }>} topics - Category definitions.
 * @returns {{ md_zh: string, md_en: string, date: string }} Both Markdown
 *   documents and the date label they were rendered with.
 */
function generateMarkdown(items, topCount, topics) {
  const dateFormat = {
    year: 'numeric',
    month: '2-digit',
    day: '2-digit',
    timeZone: 'Asia/Shanghai'
  };
  const date = new Date()
    .toLocaleDateString('zh-CN', dateFormat)
    .split('/')
    .join('-');

  const rendered = { md_zh: undefined, md_en: undefined, date };
  console.log('🌐 生成中文版(翻译中)...');
  rendered.md_zh = generateMarkdownZH(items, topCount, topics, date);
  console.log('🌐 生成英文版...');
  rendered.md_en = generateMarkdownEN(items, topCount, topics, date);
  return rendered;
}
// ============ Main flow ============

/**
 * Run one full collection cycle: load configuration and the seen-URL cache,
 * collect from arXiv, Hugging Face and GitHub Trending, drop already-seen
 * URLs, rank the rest, write the Chinese and English briefs into `DAILY_DIR`
 * and persist the updated cache (most recent 5000 URLs).
 *
 * @returns {Promise<{success: boolean, outputPathZH: string,
 *   outputPathEN: string, date: string, itemCount: number,
 *   content_zh: string, content_en: string}>} Paths and contents of the
 *   generated briefs.
 * @throws {Error} When the configuration file is missing or invalid, or when
 *   an output file cannot be written.
 */
async function main() {
  console.log('🚀 AINewsCollector 开始运行...\n');
  const config = JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf8'));
  // A source missing from the config is collected with its built-in defaults.
  const sources = config.sources || {};

  let seenUrls = new Set();
  if (fs.existsSync(CACHE_PATH)) {
    try {
      const cache = JSON.parse(fs.readFileSync(CACHE_PATH, 'utf8'));
      seenUrls = new Set(cache.urls || []);
    } catch (err) {
      // The cache is only a dedupe aid; an unreadable one (e.g. truncated by
      // an interrupted write) must not block every future run.
      console.error('[Cache] Error:', err.message);
    }
  }

  let allItems = [];

  console.log('📡 采集 arXiv...');
  const arxivItems = await collectArxiv(sources.arxiv || {});
  allItems.push(...arxivItems);
  console.log(` 获取 ${arxivItems.length}\n`);

  console.log('📡 采集 Hugging Face Papers...');
  const hfItems = await collectHuggingFace(sources.huggingface || {});
  allItems.push(...hfItems);
  console.log(` 获取 ${hfItems.length}\n`);

  console.log('📡 采集 GitHub Trending...');
  const ghItems = await collectGitHub(sources.github || {});
  allItems.push(...ghItems);
  console.log(` 获取 ${ghItems.length}\n`);

  const originalCount = allItems.length;
  allItems = deduplicate(allItems, seenUrls);
  console.log(`🔄 去重: ${originalCount} → ${allItems.length}\n`);

  allItems = sortItems(allItems, config.topics);
  const { md_zh, md_en, date } = generateMarkdown(allItems, config.output.topCount, config.topics);

  // `recursive: true` makes this a no-op when the directory already exists.
  fs.mkdirSync(DAILY_DIR, { recursive: true });

  // Chinese edition
  const outputPathZH = path.join(DAILY_DIR, `${date}_zh.md`);
  fs.writeFileSync(outputPathZH, md_zh);
  console.log(`📝 中文简报: ${outputPathZH}\n`);

  // English edition
  const outputPathEN = path.join(DAILY_DIR, `${date}_en.md`);
  fs.writeFileSync(outputPathEN, md_en);
  console.log(`📝 英文简报: ${outputPathEN}\n`);

  // The cache directory is not guaranteed to exist on a first run.
  fs.mkdirSync(path.dirname(CACHE_PATH), { recursive: true });
  const cacheData = { lastUpdate: new Date().toISOString(), urls: Array.from(seenUrls).slice(-5000) };
  fs.writeFileSync(CACHE_PATH, JSON.stringify(cacheData, null, 2));

  console.log('✅ 采集完成!\n');
  return {
    success: true,
    outputPathZH,
    outputPathEN,
    date,
    itemCount: allItems.length,
    content_zh: md_zh,
    content_en: md_en
  };
}
// When this file is executed directly (rather than required by another
// module), run the collector and exit with status 1 on any unhandled failure.
if (require.main === module) {
  const reportFatal = (err) => {
    console.error('❌ Error:', err);
    process.exit(1);
  };
  main().catch(reportFatal);
}

// Expose the entry point for programmatic use.
module.exports = { main };