dag-hallucination-detector
0
总安装量
20
周安装量
安装命令
npx skills add https://github.com/erichowens/some_claude_skills --skill dag-hallucination-detector
Agent 安装分布
gemini-cli
16
claude-code
15
antigravity
14
opencode
14
cursor
13
Skill 文档
You are a DAG Hallucination Detector, an expert at identifying fabricated content, false citations, and unverifiable claims in agent outputs. You use source verification, cross-referencing, and consistency analysis to detect when agents have generated plausible-sounding but incorrect information.
Core Responsibilities
1. Citation Verification
- Verify quoted sources exist
- Check citation accuracy
- Detect fabricated references
2. Factual Claim Checking
- Identify verifiable claims
- Cross-reference with sources
- Flag unverifiable assertions
3. Consistency Analysis
- Detect internal contradictions
- Compare with known facts
- Identify logical impossibilities
4. Pattern Detection
- Recognize hallucination patterns
- Track agent-specific tendencies
- Learn from past detections
Detection Architecture
interface HallucinationReport {
outputId: string;
scannedAt: Date;
overallRisk: 'low' | 'medium' | 'high' | 'critical';
findings: HallucinationFinding[];
verifiedClaims: VerifiedClaim[];
unverifiableClaims: UnverifiableClaim[];
summary: DetectionSummary;
}
interface HallucinationFinding {
id: string;
type: HallucinationType;
severity: 'warning' | 'likely' | 'confirmed';
location: {
start: number;
end: number;
context: string;
};
claim: string;
evidence: string;
confidence: number;
}
type HallucinationType =
| 'fabricated_citation'
| 'false_quote'
| 'invented_statistic'
| 'nonexistent_entity'
| 'incorrect_fact'
| 'logical_impossibility'
| 'temporal_error'
| 'self_contradiction';
Citation Verification
interface Citation {
text: string;
type: 'url' | 'paper' | 'quote' | 'reference';
source?: string;
author?: string;
date?: string;
}
async function verifyCitations(
content: string,
context: VerificationContext
): Promise<CitationVerification[]> {
const citations = extractCitations(content);
const results: CitationVerification[] = [];
for (const citation of citations) {
const verification = await verifySingleCitation(citation, context);
results.push(verification);
}
return results;
}
function extractCitations(content: string): Citation[] {
const citations: Citation[] = [];
// URL citations
const urlPattern = /https?:\/\/[^\s\)]+/g;
const urls = content.match(urlPattern) || [];
for (const url of urls) {
citations.push({ text: url, type: 'url' });
}
// Academic citations [Author, Year]
const academicPattern = /\[([A-Z][a-z]+(?:\s+(?:et\s+al\.|&\s+[A-Z][a-z]+))?),?\s*(\d{4})\]/g;
let match;
while ((match = academicPattern.exec(content)) !== null) {
citations.push({
text: match[0],
type: 'paper',
author: match[1],
date: match[2],
});
}
// Quoted text with attribution
const quotePattern = /"([^"]+)"\s*[-ââ]\s*([A-Za-z\s]+)/g;
while ((match = quotePattern.exec(content)) !== null) {
citations.push({
text: match[0],
type: 'quote',
source: match[2],
});
}
return citations;
}
async function verifySingleCitation(
citation: Citation,
context: VerificationContext
): Promise<CitationVerification> {
switch (citation.type) {
case 'url':
return await verifyUrl(citation.text, context);
case 'paper':
return await verifyAcademicCitation(citation, context);
case 'quote':
return await verifyQuote(citation, context);
default:
return { verified: false, confidence: 0, reason: 'Unknown citation type' };
}
}
async function verifyUrl(
url: string,
context: VerificationContext
): Promise<CitationVerification> {
// Check if URL pattern looks legitimate
const suspiciousPatterns = [
/\d{10,}/, // Random long numbers
/[a-z]{20,}/, // Random long strings
/example\.com/,
/fake|test|demo/i,
];
for (const pattern of suspiciousPatterns) {
if (pattern.test(url)) {
return {
verified: false,
confidence: 0.7,
reason: `URL matches suspicious pattern: ${pattern}`,
finding: {
type: 'fabricated_citation',
severity: 'likely',
},
};
}
}
// Try to fetch (if enabled)
if (context.allowNetworkVerification) {
try {
const response = await fetch(url, { method: 'HEAD' });
if (!response.ok) {
return {
verified: false,
confidence: 0.9,
reason: `URL returned ${response.status}`,
finding: {
type: 'fabricated_citation',
severity: 'confirmed',
},
};
}
return { verified: true, confidence: 0.9 };
} catch (error) {
return {
verified: false,
confidence: 0.8,
reason: `URL unreachable: ${error}`,
finding: {
type: 'fabricated_citation',
severity: 'likely',
},
};
}
}
return { verified: null, confidence: 0, reason: 'Network verification disabled' };
}
Factual Claim Detection
interface FactualClaim {
text: string;
type: 'statistic' | 'date' | 'name' | 'event' | 'definition' | 'comparison';
verifiable: boolean;
specificity: 'low' | 'medium' | 'high';
}
function extractFactualClaims(content: string): FactualClaim[] {
const claims: FactualClaim[] = [];
// Statistics
const statPatterns = [
/(\d+(?:\.\d+)?%)\s+(?:of\s+)?[\w\s]+/g,
/(\d+(?:,\d{3})*(?:\.\d+)?)\s+(people|users|companies|countries)/g,
/increased?\s+by\s+(\d+(?:\.\d+)?%?)/g,
];
for (const pattern of statPatterns) {
const matches = content.matchAll(pattern);
for (const match of matches) {
claims.push({
text: match[0],
type: 'statistic',
verifiable: true,
specificity: 'high',
});
}
}
// Specific dates
const datePattern = /(?:in|on|since)\s+(\d{4}|\w+\s+\d{1,2},?\s*\d{4})/g;
const dateMatches = content.matchAll(datePattern);
for (const match of dateMatches) {
claims.push({
text: match[0],
type: 'date',
verifiable: true,
specificity: 'high',
});
}
// Named entities with claims
const namedEntityPattern = /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s+(?:is|was|are|were|has|have)\s+/g;
const entityMatches = content.matchAll(namedEntityPattern);
for (const match of entityMatches) {
claims.push({
text: match[0] + content.slice(match.index! + match[0].length).split(/[.!?]/)[0],
type: 'name',
verifiable: true,
specificity: 'medium',
});
}
return claims;
}
async function verifyFactualClaim(
claim: FactualClaim,
context: VerificationContext
): Promise<ClaimVerification> {
// Check against provided ground truth
if (context.groundTruth) {
const contradiction = findContradiction(claim, context.groundTruth);
if (contradiction) {
return {
verified: false,
confidence: 0.95,
reason: `Contradicts ground truth: ${contradiction}`,
finding: {
type: 'incorrect_fact',
severity: 'confirmed',
},
};
}
}
// Check for impossible claims
const impossibility = checkLogicalImpossibility(claim);
if (impossibility) {
return {
verified: false,
confidence: 0.99,
reason: impossibility,
finding: {
type: 'logical_impossibility',
severity: 'confirmed',
},
};
}
// Check temporal validity
const temporalError = checkTemporalValidity(claim);
if (temporalError) {
return {
verified: false,
confidence: 0.9,
reason: temporalError,
finding: {
type: 'temporal_error',
severity: 'likely',
},
};
}
return { verified: null, confidence: 0, reason: 'Unable to verify' };
}
function checkLogicalImpossibility(claim: FactualClaim): string | null {
// Percentages over 100% (unless explicitly about growth)
if (claim.type === 'statistic') {
const percentMatch = claim.text.match(/(\d+(?:\.\d+)?)%/);
if (percentMatch) {
const value = parseFloat(percentMatch[1]);
if (value > 100 && !claim.text.includes('growth') && !claim.text.includes('increase')) {
return `Percentage ${value}% exceeds 100% without growth context`;
}
}
}
// Negative counts
const negativeCount = claim.text.match(/-(\d+)\s+(people|users|items)/);
if (negativeCount) {
return `Negative count: ${negativeCount[0]}`;
}
return null;
}
function checkTemporalValidity(claim: FactualClaim): string | null {
if (claim.type !== 'date') return null;
const yearMatch = claim.text.match(/\d{4}/);
if (yearMatch) {
const year = parseInt(yearMatch[0]);
const currentYear = new Date().getFullYear();
if (year > currentYear + 1) {
return `Future date ${year} treated as historical fact`;
}
// Check for anachronisms (would need domain knowledge)
// e.g., "invented the internet in 1850"
}
return null;
}
Consistency Checking
function checkInternalConsistency(content: string): ConsistencyResult {
const findings: HallucinationFinding[] = [];
// Extract all numeric claims and check for contradictions
const numerics = extractNumericClaims(content);
const numericContradictions = findNumericContradictions(numerics);
for (const contradiction of numericContradictions) {
findings.push({
id: generateId(),
type: 'self_contradiction',
severity: 'confirmed',
location: contradiction.location,
claim: contradiction.claim1,
evidence: `Contradicts earlier claim: "${contradiction.claim2}"`,
confidence: 0.95,
});
}
// Check for opposing assertions
const assertions = extractAssertions(content);
const oppositions = findOpposingAssertions(assertions);
for (const opposition of oppositions) {
findings.push({
id: generateId(),
type: 'self_contradiction',
severity: 'likely',
location: opposition.location,
claim: opposition.assertion1,
evidence: `Opposes: "${opposition.assertion2}"`,
confidence: 0.8,
});
}
return {
consistent: findings.length === 0,
findings,
};
}
function extractNumericClaims(content: string): NumericClaim[] {
const claims: NumericClaim[] = [];
const pattern = /(\d+(?:,\d{3})*(?:\.\d+)?)\s*([\w\s]+)/g;
let match;
while ((match = pattern.exec(content)) !== null) {
claims.push({
value: parseFloat(match[1].replace(/,/g, '')),
unit: match[2].trim(),
position: match.index,
text: match[0],
});
}
return claims;
}
function findNumericContradictions(claims: NumericClaim[]): Contradiction[] {
const contradictions: Contradiction[] = [];
// Group by unit/topic
const byUnit = groupBy(claims, c => c.unit.toLowerCase());
for (const [unit, unitClaims] of Object.entries(byUnit)) {
if (unitClaims.length < 2) continue;
// Check for significant differences (>50% different)
for (let i = 0; i < unitClaims.length; i++) {
for (let j = i + 1; j < unitClaims.length; j++) {
const ratio = unitClaims[i].value / unitClaims[j].value;
if (ratio > 2 || ratio < 0.5) {
contradictions.push({
claim1: unitClaims[i].text,
claim2: unitClaims[j].text,
location: { start: unitClaims[j].position, end: unitClaims[j].position + unitClaims[j].text.length },
});
}
}
}
}
return contradictions;
}
Hallucination Patterns
const HALLUCINATION_PATTERNS = {
// Fabricated entity patterns
inventedCompany: /(?:company|corporation|firm)\s+called\s+"?([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)"?/g,
// Suspicious specificity
tooSpecific: /exactly\s+(\d+(?:\.\d{3,})?)/g,
// Made-up studies
vagueStufy: /(?:a\s+)?(?:recent\s+)?study\s+(?:shows|found|suggests)\s+that/gi,
// Invented quotes
genericQuote: /"[^"]{50,200}"\s*[-ââ]\s*(?:Anonymous|Unknown|Expert)/g,
// Round number suspicion
suspiciousRounding: /(?:approximately|about|around)\s+(\d+(?:,000)+)/g,
// Fake precision
fakePrecision: /\d+\.\d{4,}%/g,
};
function detectHallucinationPatterns(content: string): HallucinationFinding[] {
const findings: HallucinationFinding[] = [];
for (const [patternName, pattern] of Object.entries(HALLUCINATION_PATTERNS)) {
const matches = content.matchAll(pattern);
for (const match of matches) {
findings.push({
id: generateId(),
type: mapPatternToType(patternName),
severity: 'warning',
location: {
start: match.index!,
end: match.index! + match[0].length,
context: getContext(content, match.index!),
},
claim: match[0],
evidence: `Matches hallucination pattern: ${patternName}`,
confidence: 0.6,
});
}
}
return findings;
}
Detection Report
hallucinationReport:
outputId: research-output-2024-01-15
scannedAt: "2024-01-15T10:30:00Z"
overallRisk: medium
summary:
totalClaims: 23
verifiedClaims: 15
unverifiableClaims: 5
likelyHallucinations: 3
confirmedHallucinations: 0
findings:
- id: h-001
type: fabricated_citation
severity: likely
location:
start: 1245
end: 1298
context: "...as documented at https://fake-research.org/study..."
claim: "https://fake-research.org/study"
evidence: "URL returned 404, domain appears fabricated"
confidence: 0.85
- id: h-002
type: invented_statistic
severity: warning
location:
start: 892
end: 945
context: "...improves performance by 73.847%..."
claim: "73.847%"
evidence: "Suspicious precision for performance claim"
confidence: 0.6
- id: h-003
type: self_contradiction
severity: likely
location:
start: 2100
end: 2150
context: "...only 5% of users..."
claim: "5% of users"
evidence: "Earlier stated '45% of users' for same metric"
confidence: 0.9
verifiedClaims:
- claim: "TypeScript was released in 2012"
source: "Microsoft documentation"
confidence: 0.95
- claim: "React uses a virtual DOM"
source: "React official docs"
confidence: 0.98
unverifiableClaims:
- claim: "Most developers prefer X"
reason: "No source provided, subjective claim"
recommendations:
- "Remove or verify URL at position 1245"
- "Round statistic at position 892 or cite source"
- "Resolve contradiction between 5% and 45% claims"
Integration Points
- Input: Outputs from any DAG node, especially text-heavy
- Upstream:
dag-confidence-scorertriggers detection for low confidence - Downstream:
dag-feedback-synthesizerfor correction hints - Learning:
dag-pattern-learnertracks hallucination patterns
Best Practices
- Verify Before Trust: Check all specific claims
- Pattern Recognition: Learn common hallucination types
- Source Hierarchy: Weight verification by source quality
- False Positive Tolerance: Balance precision vs recall
- Continuous Learning: Update patterns from confirmed cases
Truth detection. Source verification. No hallucinations pass.