diff --git a/HUGGINGFACE_API_ISSUE.md b/HUGGINGFACE_API_ISSUE.md new file mode 100644 index 0000000..a4d0fe4 --- /dev/null +++ b/HUGGINGFACE_API_ISSUE.md @@ -0,0 +1,83 @@ +# HuggingFace API问题分析 + +## 问题诊断 + +**错误信息:** +``` +HTTP/2 401 +x-error-message: Invalid username or password. +{"error":"Invalid username or password."} +``` + +**原因:** +HuggingFace的 `/api/daily-papers` 端点需要认证才能访问。(注:本次变更中 collect.js 已改用 `/api/daily_papers`(下划线)端点,该端点无需认证;上述 401 更可能是端点路径写错所致,以下方案仅在确实需要认证访问时适用。) + +## 解决方案 + +### 方案1:获取HuggingFace API Token(推荐) + +1. **注册/登录HuggingFace账号** + - 访问:https://huggingface.co/join + - 或登录:https://huggingface.co/login + +2. **获取Access Token** + - 访问:https://huggingface.co/settings/tokens + - 点击 "Create new token" + - 选择 "Read" 权限 + - 复制生成的token + +3. **配置到采集脚本** + - 在collect.js中添加认证头: + ```javascript + const headers = { + 'Authorization': `Bearer ${process.env.HF_TOKEN || 'YOUR_TOKEN_HERE'}` + }; + ``` + +4. **设置环境变量** + ```bash + export HF_TOKEN="hf_xxxxxxxxxxxx" + ``` + +### 方案2:使用HuggingFace Hub库(Python) + +```python +from huggingface_hub import HfApi + +api = HfApi() +papers = api.list_papers(limit=30) +``` + +### 方案3:暂时禁用HuggingFace源 + +在 `config.json` 中: +```json +{ + "sources": { + "huggingface": { + "enabled": false + } + } +} +``` + +## API文档 + +- 官方API文档:https://huggingface.co/docs/hub/api +- OpenAPI规范:https://huggingface.co/.well-known/openapi.json +- 速率限制:所有API调用都受速率限制 + +## 当前状态 + +- ✅ GitHub Trending:正常工作 +- ❌ HuggingFace Papers:需要认证 +- ⚠️ arXiv:需要检查 + +## 建议 + +1. **短期**:暂时禁用HuggingFace源,只使用GitHub +2. 
**长期**:注册HuggingFace账号并获取token,启用认证访问 + +--- + +*诊断时间: 2026-02-25* diff --git a/README.md b/README.md index f3b7ff7..eeff447 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,78 @@ -# AINewsCollector +# AI News Collector -自动收集并整理 AI 圈最新消息的 Agent。 +自动采集AI相关新闻和趋势的工具。 ## 功能 -- 从 arXiv、Hugging Face Papers、GitHub Trending、技术博客采集信息 -- 基于话题关键词筛选和排序 -- 生成每日简报 (.md) -- 推送至飞书 +- 📡 多源采集:arXiv、HuggingFace Papers、GitHub Trending +- 🔄 智能去重:避免重复内容 +- 📊 权重排序:根据来源和关键词匹配度排序 +- 📝 简报生成:自动生成Markdown格式简报 -## 话题配置 +## 配置 -- AI 编程工具 / Code Agent -- Agent 框架 -- AI 基础设施 / 推理优化 +### 代理设置 -## 调度 - -- 23:00 收集任务 -- 09:00 推送任务 - -## 目录结构 +工具支持HTTP/HTTPS代理,用于访问国外数据源。 +**方式1:环境变量** +```bash +export HTTP_PROXY=http://127.0.0.1:7890 +export HTTPS_PROXY=http://127.0.0.1:7890 ``` -AINewsCollector/ -├── config.yaml # 话题配置 -├── daily/ # 日期归档 -├── cache/ # 去重缓存 -└── skill/ # OpenClaw Skill + +**方式2:修改代码** +在 `skill/ai-news-collector/collect.js` 中修改 `PROXY_URL` 常量的默认值: +```javascript +const PROXY_URL = 'http://127.0.0.1:7890'; ``` +### 数据源配置 + +编辑 `config.json` 文件: + +```json +{ + "topics": [...], // 主题关键词 + "sources": { + "arxiv": { "enabled": true, ... }, + "huggingface": { "enabled": true, ... }, + "github": { "enabled": true, ... } + } +} +``` + +## 使用 + +### 手动采集 +```bash +cd /Users/chenbj/home/workspace/OpenclawSpace/AINewsCollector +node skill/ai-news-collector/collect.js +``` + +### 定时采集 +已配置cron job,每天自动运行两次: +- 09:00 - 每日推送 +- 23:00 - 每日采集 + +## 输出 + +生成的简报保存在 `daily/` 目录: +- 文件名格式:`YYYY-MM-DD.md` +- 内容包括:Top 10重要消息、分类汇总 + +## 故障排查 + +### 采集为0条 +1. 检查网络连接 +2. 确认代理配置正确 +3. 
查看错误日志输出 + +### 代理配置 +如果访问HuggingFace或GitHub超时,需要配置代理: +- 代理地址:`http://127.0.0.1:7890` +- 支持HTTP和HTTPS协议 + --- -创建日期:2026-02-22 +*更新时间: 2026-02-24* diff --git a/cache/seen_urls.json b/cache/seen_urls.json index 26dfeff..7fc0e1b 100644 --- a/cache/seen_urls.json +++ b/cache/seen_urls.json @@ -1,4 +1,136 @@ { - "lastUpdate": "2026-02-22T16:15:49.916Z", - "urls": [] + "lastUpdate": "2026-02-27T15:33:33.096Z", + "urls": [ + "http://arxiv.org/abs/2602.23360v1", + "http://arxiv.org/abs/2602.23359v1", + "http://arxiv.org/abs/2602.23353v1", + "http://arxiv.org/abs/2602.23349v1", + "http://arxiv.org/abs/2602.23335v1", + "http://arxiv.org/abs/2602.23334v1", + "http://arxiv.org/abs/2602.23331v1", + "http://arxiv.org/abs/2602.23330v1", + "http://arxiv.org/abs/2602.23329v1", + "http://arxiv.org/abs/2602.23318v1", + "http://arxiv.org/abs/2602.23315v1", + "http://arxiv.org/abs/2602.23312v1", + "http://arxiv.org/abs/2602.23302v1", + "http://arxiv.org/abs/2602.23296v1", + "http://arxiv.org/abs/2602.23286v1", + "http://arxiv.org/abs/2602.23285v1", + "http://arxiv.org/abs/2602.23276v1", + "http://arxiv.org/abs/2602.23271v1", + "http://arxiv.org/abs/2602.23259v1", + "http://arxiv.org/abs/2602.23258v1", + "http://arxiv.org/abs/2602.23248v1", + "http://arxiv.org/abs/2602.23242v1", + "http://arxiv.org/abs/2602.23239v1", + "http://arxiv.org/abs/2602.23235v1", + "http://arxiv.org/abs/2602.23234v1", + "http://arxiv.org/abs/2602.23232v1", + "http://arxiv.org/abs/2602.23228v1", + "http://arxiv.org/abs/2602.23225v1", + "http://arxiv.org/abs/2602.23203v1", + "http://arxiv.org/abs/2602.23199v1", + "http://arxiv.org/abs/2602.23193v1", + "http://arxiv.org/abs/2602.23172v1", + "http://arxiv.org/abs/2602.23163v1", + "http://arxiv.org/abs/2602.23161v1", + "http://arxiv.org/abs/2602.23153v1", + "http://arxiv.org/abs/2602.23152v1", + "http://arxiv.org/abs/2602.23351v1", + "http://arxiv.org/abs/2602.23300v1", + "http://arxiv.org/abs/2602.23266v1", + "http://arxiv.org/abs/2602.23200v1", + 
"http://arxiv.org/abs/2602.23197v1", + "http://arxiv.org/abs/2602.23184v1", + "http://arxiv.org/abs/2602.23136v1", + "http://arxiv.org/abs/2602.23079v1", + "http://arxiv.org/abs/2602.23075v1", + "http://arxiv.org/abs/2602.23071v1", + "http://arxiv.org/abs/2602.23070v1", + "http://arxiv.org/abs/2602.23062v1", + "http://arxiv.org/abs/2602.23061v1", + "http://arxiv.org/abs/2602.23057v1", + "http://arxiv.org/abs/2602.22958v1", + "http://arxiv.org/abs/2602.22918v1", + "http://arxiv.org/abs/2602.22911v1", + "http://arxiv.org/abs/2602.22897v1", + "http://arxiv.org/abs/2602.22871v1", + "http://arxiv.org/abs/2602.22868v1", + "http://arxiv.org/abs/2602.22865v1", + "http://arxiv.org/abs/2602.22846v1", + "http://arxiv.org/abs/2602.22831v1", + "http://arxiv.org/abs/2602.22828v1", + "http://arxiv.org/abs/2602.23358v1", + "http://arxiv.org/abs/2602.23341v1", + "http://arxiv.org/abs/2602.23336v1", + "http://arxiv.org/abs/2602.23321v1", + "http://arxiv.org/abs/2602.23320v1", + "http://arxiv.org/abs/2602.23305v1", + "http://arxiv.org/abs/2602.23303v1", + "http://arxiv.org/abs/2602.23295v1", + "http://arxiv.org/abs/2602.23280v1", + "http://arxiv.org/abs/2602.23277v1", + "http://arxiv.org/abs/2602.23219v1", + "http://arxiv.org/abs/2602.23214v1", + "http://arxiv.org/abs/2602.23201v1", + "http://arxiv.org/abs/2602.23192v1", + "http://arxiv.org/abs/2602.23188v1", + "http://arxiv.org/abs/2602.23182v1", + "http://arxiv.org/abs/2602.23179v1", + "http://arxiv.org/abs/2602.23167v1", + "http://arxiv.org/abs/2602.23164v1", + "http://arxiv.org/abs/2602.23159v1", + "http://arxiv.org/abs/2602.23146v1", + "http://arxiv.org/abs/2602.23142v1", + "http://arxiv.org/abs/2602.23135v1", + "http://arxiv.org/abs/2602.23132v1", + "http://arxiv.org/abs/2602.23128v1", + "https://huggingface.co/papers/2602.23339", + "https://huggingface.co/papers/2602.20332", + "https://huggingface.co/papers/2602.20300", + "https://huggingface.co/papers/2602.18253", + "https://huggingface.co/papers/2602.22045", + 
"https://huggingface.co/papers/2602.22638", + "https://huggingface.co/papers/2602.20981", + "https://huggingface.co/papers/2602.23205", + "https://huggingface.co/papers/2602.23363", + "https://huggingface.co/papers/2602.22675", + "https://huggingface.co/papers/2602.23258", + "https://huggingface.co/papers/2602.22859", + "https://huggingface.co/papers/2602.21760", + "https://huggingface.co/papers/2602.22479", + "https://huggingface.co/papers/2602.17594", + "https://huggingface.co/papers/2602.23008", + "https://huggingface.co/papers/2602.22766", + "https://huggingface.co/papers/2602.22594", + "https://huggingface.co/papers/2602.22437", + "https://huggingface.co/papers/2602.23058", + "https://huggingface.co/papers/2602.22897", + "https://huggingface.co/papers/2602.23165", + "https://huggingface.co/papers/2602.23152", + "https://huggingface.co/papers/2602.23259", + "https://huggingface.co/papers/2602.19424", + "https://huggingface.co/papers/2602.20933", + "https://huggingface.co/papers/2602.21456", + "https://huggingface.co/papers/2602.21374", + "https://huggingface.co/papers/2602.16729", + "https://huggingface.co/papers/2602.20273", + "https://github.com/login?return_to=%2fruvnet%2fwifi-densepose", + "https://github.com/login?return_to=%2fbytedance%2fdeer-flow", + "https://github.com/login?return_to=%2fmoonshine-ai%2fmoonshine", + "https://github.com/sponsors/muratcankoylan", + "https://github.com/sponsors/obra", + "https://github.com/login?return_to=%2fruvnet%2fclaude-flow", + "https://github.com/login?return_to=%2fdatawhalechina%2fhello-agents", + "https://github.com/sponsors/abhigyanpatwari", + "https://github.com/login?return_to=%2fmoeru-ai%2fairi", + "https://github.com/login?return_to=%2fanthropics%2fclaude-code", + "https://github.com/login?return_to=%2fruvnet%2fruvector", + "https://github.com/login?return_to=%2fwei-shaw%2fclaude-relay-service", + "https://github.com/login?return_to=%2ftukaani-project%2fxz", + "https://github.com/sponsors/d4vinci", + 
"https://github.com/sponsors/steipete", + "https://github.com/login?return_to=%2falibaba%2fopensandbox" + ] } \ No newline at end of file diff --git a/daily/2026-02-27.md b/daily/2026-02-27.md new file mode 100644 index 0000000..73a6e37 --- /dev/null +++ b/daily/2026-02-27.md @@ -0,0 +1,57 @@ +# AI Daily Brief - 2026-02-27 + +> 采集时间: 2026/2/27 23:33:33 +> 总条目: 131 + +## 🔥 Top 10 重要消息 + +1. [sponsors/muratcankoylan](https://github.com/sponsors/muratcankoylan) - **GitHub Trending** + > A comprehensive collection of Agent Skills for context engineering, multi-agent architectures, and production agent systems. Use when building, optimi... + +2. [login?return_to=%2Fruvnet%2Fclaude-flow](https://github.com/login?return_to=%2Fruvnet%2Fclaude-flow) - **GitHub Trending** + > 🌊 The leading agent orchestration platform for Claude. Deploy intelligent multi-agent swarms, coordinate autonomous workflows, and build conversation... + +3. [Search More, Think Less: Rethinking Long-Horizon Agentic Search for Efficiency and Generalization](https://huggingface.co/papers/2602.22675) - **Hugging Face** + > Recent deep research agents primarily improve performance by scaling reasoning depth, but this leads to high inference cost and latency in search-inte... + +4. [AgentDropoutV2: Optimizing Information Flow in Multi-Agent Systems via Test-Time Rectify-or-Reject Pruning](https://huggingface.co/papers/2602.23258) - **Hugging Face** + > While Multi-Agent Systems (MAS) excel in complex reasoning, they suffer from the cascading impact of erroneous information generated by individual par... + +5. [Accelerating Diffusion via Hybrid Data-Pipeline Parallelism Based on Conditional Guidance Scheduling](https://huggingface.co/papers/2602.21760) - **Hugging Face** + > Diffusion models have achieved remarkable progress in high-fidelity image, video, and audio generation, yet inference remains computationally expensiv... + +6. 
[Exploratory Memory-Augmented LLM Agent via Hybrid On- and Off-Policy Optimization](https://huggingface.co/papers/2602.23008) - **Hugging Face** + > Exploration remains the key bottleneck for large language model agents trained with reinforcement learning. While prior methods exploit pretrained kno... + +7. [login?return_to=%2Fruvnet%2Fwifi-densepose](https://github.com/login?return_to=%2Fruvnet%2Fwifi-densepose) - **GitHub Trending** + > Production-ready implementation of InvisPose - a revolutionary WiFi-based dense human pose estimation system that enables real-time full-body tracking... + +8. [login?return_to=%2Fbytedance%2Fdeer-flow](https://github.com/login?return_to=%2Fbytedance%2Fdeer-flow) - **GitHub Trending** + > An open-source SuperAgent harness that researches, codes, and creates. With the help of sandboxes, memories, tools, skills and subagents, it handles d... + +9. [login?return_to=%2Fmoonshine-ai%2Fmoonshine](https://github.com/login?return_to=%2Fmoonshine-ai%2Fmoonshine) - **GitHub Trending** + > Fast and accurate automatic speech recognition (ASR) for edge devices + +10. [sponsors/obra](https://github.com/sponsors/obra) - **GitHub Trending** + > An agentic skills framework & software development methodology that works. 
+ +## 📂 分类汇总 + +### Agent 框架 + +- [Toward Expert Investment Teams:A Multi-Agent LLM System with Fine-Grained Trading Tasks](http://arxiv.org/abs/2602.23330v1) - arXiv +- [AgentDropoutV2: Optimizing Information Flow in Multi-Agent Systems via Test-Time Rectify-or-Reject Pruning](http://arxiv.org/abs/2602.23258v1) - arXiv + +### AI 基础设施 / 推理优化 + +- [Bitwise Systolic Array Architecture for Runtime-Reconfigurable Multi-precision Quantized Multiplication on Hardware Accelerators](http://arxiv.org/abs/2602.23334v1) - arXiv +- [Invariant Transformation and Resampling based Epistemic-Uncertainty Reduction](http://arxiv.org/abs/2602.23315v1) - arXiv +- [Agency and Architectural Limits: Why Optimization-Based Systems Cannot Be Norm-Responsive](http://arxiv.org/abs/2602.23239v1) - arXiv +- [InnerQ: Hardware-aware Tuning-free Quantization of KV Cache for Large Language Models](http://arxiv.org/abs/2602.23200v1) - arXiv +- [Assessing Deanonymization Risks with Stylometry-Assisted LLM Agent](http://arxiv.org/abs/2602.23079v1) - arXiv +- [Rejection Mixing: Fast Semantic Propagation of Mask Tokens for Efficient DLLM Inference](http://arxiv.org/abs/2602.22868v1) - arXiv +- [Differentiable Zero-One Loss via Hypersimplex Projections](http://arxiv.org/abs/2602.23336v1) - arXiv +- [FairQuant: Fairness-Aware Mixed-Precision Quantization for Medical Image Classification](http://arxiv.org/abs/2602.23192v1) - arXiv + +--- +*Generated by AINewsCollector* diff --git a/skill/ai-news-collector/collect.js b/skill/ai-news-collector/collect.js index d89eb73..6646ea4 100644 --- a/skill/ai-news-collector/collect.js +++ b/skill/ai-news-collector/collect.js @@ -6,32 +6,28 @@ const fs = require('fs'); const path = require('path'); -const https = require('https'); -const http = require('http'); +const { execSync } = require('child_process'); // 配置 const CONFIG_PATH = path.join(__dirname, '../../config.json'); const CACHE_PATH = path.join(__dirname, '../../cache/seen_urls.json'); const DAILY_DIR = 
path.join(__dirname, '../../daily'); -// HTTP 请求封装 +// 代理配置 +const PROXY_URL = process.env.HTTP_PROXY || process.env.HTTPS_PROXY || 'http://127.0.0.1:7890'; + +// 使用 curl 子进程请求(稳定支持代理) function fetch(url) { - return new Promise((resolve, reject) => { - const client = url.startsWith('https') ? https : http; - const req = client.get(url, { - headers: { - 'User-Agent': 'Mozilla/5.0 (compatible; AINewsCollector/1.0)', - 'Accept': 'application/json, text/html, */*' - }, - timeout: 30000 - }, (res) => { - let data = ''; - res.on('data', chunk => data += chunk); - res.on('end', () => resolve({ data, status: res.statusCode })); - }); - req.on('error', reject); - req.on('timeout', () => { req.destroy(); reject(new Error('Timeout')); }); - }); + try { + const proxyFlag = PROXY_URL ? `--proxy "${PROXY_URL}"` : ''; + const data = execSync( + `curl -s ${proxyFlag} -L --max-time 30 -H "User-Agent: Mozilla/5.0 (compatible; AINewsCollector/1.0)" "${url}"`, + { encoding: 'utf8', maxBuffer: 10 * 1024 * 1024 } + ); + return { data, status: 200 }; + } catch (err) { + throw new Error(`Fetch failed: ${err.message}`); + } } // ============ 数据采集器 ============ @@ -39,32 +35,46 @@ function fetch(url) { async function collectArxiv(config) { const items = []; const categories = config.categories || ['cs.AI', 'cs.CL', 'cs.LG']; - + + // 计算48小时前的日期(用于过滤) + const now = new Date(); + const twoDaysAgo = new Date(now.getTime() - 48 * 60 * 60 * 1000); + for (const cat of categories) { try { - const url = `http://export.arxiv.org/api/query?search_query=cat:${cat}&sortBy=lastUpdatedDate&sortOrder=descending&maxResults=${config.maxResults || 50}`; + // 使用 lastUpdatedDate 排序获取最新论文,然后在代码里过滤 + const url = `https://export.arxiv.org/api/query?search_query=cat:${cat}&sortBy=lastUpdatedDate&sortOrder=descending&max_results=${config.maxResults || 50}`; + + console.log(` 查询: ${cat}`); const { data } = await fetch(url); - + const entries = data.split('').slice(1); - + for (const entry of entries) { const title = 
entry.match(/([\s\S]*?)<\/title>/)?.[1]?.trim().replace(/\n/g, ' '); const summary = entry.match(/<summary>([\s\S]*?)<\/summary>/)?.[1]?.trim().replace(/\n/g, ' ').slice(0, 200); const link = entry.match(/<id>([\s\S]*?)<\/id>/)?.[1]?.trim(); - + const updated = entry.match(/<updated>([\s\S]*?)<\/updated>/)?.[1]?.trim(); + const published = entry.match(/<published>([\s\S]*?)<\/published>/)?.[1]?.trim(); + if (title && link) { - items.push({ - title, - summary: summary || '', - url: link, - source: 'arXiv', - sourceWeight: config.weight || 6, - date: new Date().toISOString().split('T')[0] - }); + const publishedDate = published ? new Date(published) : null; + + // 只保留最近48小时内发布的论文 + if (publishedDate && publishedDate >= twoDaysAgo) { + items.push({ + title, + summary: summary || '', + url: link, + source: 'arXiv', + sourceWeight: config.weight || 6, + date: published ? published.split('T')[0] : new Date().toISOString().split('T')[0] + }); + } } } } catch (err) { - console.error(`[arXiv] Error:`, err.message); + console.error(`[arXiv] Error for ${cat}:`, err.message); } } return items; @@ -73,9 +83,15 @@ async function collectArxiv(config) { async function collectHuggingFace(config) { const items = []; try { - const { data } = await fetch(`https://huggingface.co/api/daily-papers?limit=${config.maxResults || 30}`); + // 使用正确的 API 端点:daily_papers(下划线),不需要认证 + const { data } = await fetch(`https://huggingface.co/api/daily_papers?limit=${config.maxResults || 30}`); const papers = JSON.parse(data); - + + if (!Array.isArray(papers)) { + console.error('[HuggingFace] Error: API返回的不是数组'); + return items; + } + for (const paper of papers) { items.push({ title: paper.title || paper.paper?.title || 'Untitled',