// <nowiki>
(function (mw) {
"use strict";
// Only run on an existing main-namespace page that is being viewed: bail out
// of the enclosing function when the namespace number is not 0, the action is
// not "view", or the article ID is 0.
// NOTE(review): MediaWiki documents wgArticleId === 0 as "page does not exist" — confirm.
if (
mw.config.get("wgNamespaceNumber") !== 0 ||
mw.config.get("wgAction") !== "view" ||
mw.config.get("wgArticleId") === 0
) {
return;
}
// ==== 01 Constants and Config =============================================
// Tool name; used in this file as a prefix for storage keys, the IndexedDB
// database name and console messages.
const TOOL_NAME = "WikidataCleanup";
// Property IDs ("P…" strings), named after the property they refer to.
const PID_ARCHIVE_DATE = "P2960";
const PID_ARCHIVE_URL = "P1065";
const PID_CITES_WORK = "P2860";
const PID_CITIZENSHIP = "P27";
const PID_DATE_OF_BIRTH = "P569";
const PID_DATE_OF_DEATH = "P570";
const PID_END_TIME = "P582";
const PID_IMPORTED_FROM = "P143";
const PID_OCCUPATION = "P106";
const PID_REASON_FOR_DEPRECATED_RANK = "P2241";
const PID_REASON_FOR_PREFERRED_RANK = "P7452";
const PID_REFERENCE_URL = "P854";
const PID_RETRIEVED = "P813";
const PID_START_TIME = "P580";
const PID_STATED_IN = "P248";
const PID_SUBCLASS_OF = "P279";
const PID_SUBJECT_NAMED_AS = "P1810";
const PID_TITLE = "P1476";
const PID_URL = "P2699";
const PID_WIKIMEDIA_IMPORT_URL = "P4656";
// Item IDs ("Q…" strings) used as rank-reason values.
const QID_LESS_PRECISE = "Q42727519"; // item/value with less precision and/or accuracy
const QID_MOST_PRECISE = "Q71536040"; // most precise value (reason for preferred rank)
// Entity URI prefix (note: http, not https) and the two calendar-model URIs
// built from it.
const WIKIDATA_ENTITY_BASE_URL = "http://www.wikidata.org/entity/";
const URL_PROLEPTIC_JULIAN_CALENDAR = WIKIDATA_ENTITY_BASE_URL + "Q1985786";
const URL_PROLEPTIC_GREGORIAN_CALENDAR =
WIKIDATA_ENTITY_BASE_URL + "Q1985727";
// German ID sources
const PID_GND_ID = "P227";
const PID_DEUTSCHE_BIOGRAPHIE_GND_ID = "P7902";
// Inferred
const PID_INFERRED = "P3452";
const PID_DETERMINATION_METHOD = "P459";
const PID_MATCHED_BY_IDENTIFIER_FROM = "P11797";
const PID_BASED_ON_HEURISTIC = "P887";
// Described-by-source / absorb-url detector
const PID_DESCRIBED_BY_SOURCE = "P1343";
// ABSORB_URL_CLAIM_SKIP: ext-id properties that should never be absorption
// targets for certain item types, because they identify content items rather
// than the subject itself. If the item's P31 matches any listed QID, the
// URL claim match is skipped entirely (no diff emitted).
//
// Example: P1651 (YouTube video ID) identifies a video, not the person who
// uploaded it — so a YouTube URL on a human item should not be absorbed into
// a video ID statement, even if one happens to exist on the item.
// Shape: ext-id PID -> array of P31 QIDs for which the URL match is skipped.
// NOTE(review): P2969 is also a key of ABSORB_URL_CLAIM_PLATFORM_PAIRS in this
// file; confirm which table the detector consults first for humans.
const ABSORB_URL_CLAIM_SKIP = {
P1651: ["Q5"], // YouTube video ID — skip for humans
P2969: ["Q5"], // Goodreads version/edition ID — skip for humans
P10304: ["Q5"], // Apple Podcasts episode ID — skip for humans
P9882: ["Q5"], // Spotify episode ID — skip for humans
P5842: ["Q5"], // Apple Podcasts podcast ID — skip for humans
// Add further entries as new cases are found:
// "PXXX": ["QYYY", "QZZZ"],
};
// ABSORB_URL_CLAIM_PLATFORM_PAIRS: when a URL resolves to a "content item"
// ext-id property (e.g. Amazon book ID P5749) but the item already has a
// related "creator/author" property on the same platform (e.g. Amazon author
// ID P4862), the URL claim is redundant and should be removed — with the
// creator property cited as the reason.
//
// Structure: contentPid -> { creatorPid, itemTypes: [P31 QIDs] }
// The removal only fires when the item's P31 matches one of the itemTypes
// AND the item has at least one non-deprecated claim for creatorPid.
// Keyed by the "content item" ext-id PID; each value names the related
// creator PID and the P31 QIDs the rule is limited to.
const ABSORB_URL_CLAIM_PLATFORM_PAIRS = {
P5749: { creatorPid: "P4862", itemTypes: ["Q5"] }, // Amazon book ID -> Amazon author ID
P2969: { creatorPid: "P2963", itemTypes: ["Q5"] }, // Goodreads version/edition ID -> Goodreads author ID
// Add further platform pairs as needed:
// "PXXX": { creatorPid: "PYYY", itemTypes: ["QZZZ"] },
};
// ==== Developer flags =====================================================
// Set to true in test builds only — allows detectAbsorbUrlClaim to propose
// converting a URL claim into a new ext-id statement when the ext-id does not
// yet exist on the item. Never enable in production.
const DEV_ABSORB_URL_CLAIM_CREATE_MISSING = false;
// Extra checks for Wikimedia import references with no sitelinks (only
// enabled in dev mode).
const DEV_WIKIMEDIA_NO_SITELINKS = false;
// When true, detectFixCopiedLabels runs on all item types; when false
// (production), only humans (P31=Q5) are checked to avoid false positives from
// Wikipedia's capitalisation conventions and comma-based geographic
// qualifications.
const DEV_FIX_COPIED_LABELS_ALL_TYPES = false;
// Actions
// String identifiers for the kinds of edit the tool can propose. Each value is
// the camelCase form of its constant name.
const ACTION_REMOVE_REFS = "removeRefs";
const ACTION_REMOVE_CLAIM = "removeClaim";
const ACTION_MERGE_CLAIM = "mergeClaim";
const ACTION_NORMALIZE = "normalize";
const ACTION_REMOVE_ALIAS = "removeAlias";
const ACTION_DOWNGRADE_PREFERRED = "downgradePreferred";
const ACTION_CHANGE_PROPERTY = "changeProperty";
const ACTION_CHANGE_VALUE = "changeValue";
const ACTION_REMOVE_QUALIFIER = "removeQualifier";
const ACTION_CLEAN_URL = "cleanUrl";
const ACTION_MOVE_QUALIFIER_TO_REFERENCE = "moveQualifierToReference";
const ACTION_SPLIT_REFERENCE_URLS = "splitReferenceUrls";
const ACTION_ADD_EXTERNAL_ID_TO_REFERENCE = "addExternalIdToReference";
const ACTION_ABSORB_CLAIM = "absorbClaim";
const ACTION_ABSORB_URL_CLAIM = "absorbUrlClaim";
const ACTION_CONVERT_URL_TO_EXT_ID = "convertUrlToExtId";
const ACTION_MERGE_DUPLICATE_URL_CLAIMS = "mergeDuplicateUrlClaims";
const ACTION_UPGRADE_PRECISE_DATE = "upgradePreciseDate";
const ACTION_REMOVE_REDUNDANT_REF_URL = "removeRedundantRefUrl";
const ACTION_DEPRECATE_URL_CLAIM = "deprecateUrlClaim";
const ACTION_SET_MUL_LABEL = "setMulLabel";
const ACTION_ADD_MUL_ALIAS = "addMulAlias";
const ACTION_REMOVE_OBSOLETE_SNAKS = "removeObsoleteSnaks";
// Timing / cache
// One day in milliseconds; every TTL below is seven of these.
const ONE_DAY = 1000 * 60 * 60 * 24;
// Each cache has a storage key (the _vN suffix is part of the key string) and
// a time-to-live in milliseconds.
const OCC_CACHE_KEY = "wd_cleanup_occupationParents_v1";
const OCC_CACHE_TTL_MS = ONE_DAY * 7;
const WIKIPEDIA_EDITIONS_CACHE_KEY = "wd_cleanup_wikipediaEditions_v3";
const WIKIPEDIA_EDITIONS_TTL_MS = ONE_DAY * 7;
const LANG_NAMES_CACHE_KEY = "wd_cleanup_langNames_v1";
const LANG_NAMES_TTL_MS = ONE_DAY * 7;
const OBSOLETE_IDS_CACHE_KEY = "wd_cleanup_obsoleteIds_v1";
const OBSOLETE_IDS_TTL_MS = ONE_DAY * 7;
const REGEX_CACHE_KEY = "wd_cleanup_propertyRegex_v1";
const REGEX_CACHE_TTL_MS = ONE_DAY * 7;
const URL_PATTERNS_CACHE_KEY = "wd_cleanup_propertyUrlPatterns_v1";
const URL_PATTERNS_TTL_MS = ONE_DAY * 7;
const STATED_IN_CACHE_KEY = "wd_cleanup_propertyStatedIn_v3";
const STATED_IN_TTL_MS = ONE_DAY * 7;
// The three groups below additionally name a wiki page title (*_PAGE).
const URL_STRIP_CACHE_KEY = "wd_cleanup_urlStrip_v1";
const URL_STRIP_CACHE_TTL_MS = ONE_DAY * 7;
const URL_STRIP_PAGE = "User:Difool/url_tracking_params";
const URL_BLOCKLIST_CACHE_KEY = "wd_cleanup_urlBlocklist_v1";
const URL_BLOCKLIST_CACHE_TTL_MS = ONE_DAY * 7;
const URL_BLOCKLIST_PAGE = "User:Difool/URL-deprecation-blocklist";
const SOURCE_CATEGORIES_CACHE_KEY = "wd_cleanup_sourceCategories_v1";
const SOURCE_CATEGORIES_TTL_MS = ONE_DAY * 7;
const SOURCE_CATEGORIES_PAGE = "User:Difool/reference-source-categories";
const CHUNK_SIZE = 50; // wbgetentities page size
const MAX_TRAVERSAL_DEPTH = 3; // P279 BFS depth limit
const FETCH_BACKOFF_MS = 2 * 60 * 60 * 1000; // 2 h
// Storage key and IndexedDB identifiers derived from TOOL_NAME.
const FETCH_FAILURE_TIMES_KEY = `${TOOL_NAME}_fetchFailureTimes`;
const INDEXEDDB_NAME = `${TOOL_NAME}_DB`;
const INDEXEDDB_VERSION = 2;
// Human-readable labels keyed by numeric precision code (6–11).
// NOTE(review): presumably the `precision` field of Wikibase time values —
// confirm against the callers.
const precisionLabels = {
6: "millennium",
7: "century",
8: "decade",
9: "year",
10: "month",
11: "day",
};
// ==== 02 Internationalization =============================================
// Message catalogue keyed by language code, then by message key. Look-ups go
// through getMsg(), which falls back to `en` and finally to the key itself.
// NOTE(review): `nl` defines `save` and `cancel`, which have no `en`
// counterpart; for non-Dutch users getMsg() returns the bare key for them.
const i18n = {
  en: {
    cleanupTitle: "🧹 Cleanup Tool",
    cleanupIntro: "The following cleanups are possible:",
    runButton: "Run cleanup",
    settingsButton: "⚙ Settings",
    noCleanups: "No cleanups found",
    settingsTitle: "Cleanup Tool Settings",
    close: "Close",
    // detector labels
    obsolete: "Remove obsolete external-id references",
    wikimedia: "Remove 'imported from Wikimedia project' references",
    aggregator: "Remove aggregator references",
    community: "Remove community references",
    redundant: "Remove FAST if LC is present",
    inferred: "Remove inferred-only references",
    invalid: "Remove references with invalid external-id values",
    normalizeLabels: "Normalize label/description/alias text",
    removeAliasEqualsLabel: "Remove alias = label (same language)",
    removeLowPrecisionDates:
      "Remove redundant low-precision birth/death dates",
    upgradePreciseDate:
      "Upgrade precise date to preferred rank (demote deprecated less-precise duplicate to normal)",
    removeRedundantPreferred:
      "Downgrade redundant 'preferred' ranks (remove 'reason for preferred rank' (P7452) if present)",
    removeExpiredPreferred:
      "Downgrade preferred rank on a statement that has an end time in the past",
    mergeSameDateClaims: "Merge date claims with same normalized value",
    replaceWrongProperty: "Replace wrong property in claims/references",
    moveRetrievedFromExternalId: "Move qualifiers to references",
    duplicateValues: "Merge duplicate values",
    dupRetrieved: "Remove duplicate references",
    removeRedundantOccupation: "Remove redundant occupations",
    removeJulianGregorianDates:
      "Remove unreferenced Julian/Gregorian duplicate dates",
    convertWikipediaStatedIn:
      "Replace 'stated in' (P248) with 'imported from Wikimedia project' (P143) for Wikipedia editions",
    convertInvalidStatedInReference:
      "Fix invalid 'stated in' (P248) in external ID references",
    mismatchedWikimediaImport:
      "Fix mismatched 'imported from' (P143) vs Wikimedia import URL (P4656)",
    removeIdDescriptions: "Remove ID-style descriptions",
    removeEmptyEndTime: "Remove empty 'end time' (P582) qualifiers",
    addExternalIdToReference:
      "Add external ID to reference based on URL match",
    splitMultipleReferenceUrls:
      "Split multiple reference URLs into separate references",
    cleanUrls: "Clean up reference URLs",
    removeSelfCite: "Remove self-citations",
    absorbDescribedBySource:
      "Remove 'described by source' (P1343) redundant with an external ID's applicable 'stated in'",
    absorbUrlClaim:
      "Remove URL claim redundant with a matching external ID on the same item",
    convertUrlToExtId: "Convert URL claim to external ID (dev mode)",
    duplicateUrlClaims:
      "Merge duplicate URL property values (same URL on multiple properties)",
    removeRedundantRefUrl:
      "Remove reference URL (P854) redundant with the statement's own URL value",
    blocklistedUrlClaims:
      "Deprecate or remove URL claims matching the blocklist",
    partial_obsolete: "Remove obsolete identifiers from references",
    blocklistAction: "Action",
    blocklistReason: "Reason",
    // Shared by the blocklist table and the column headers; it was previously
    // declared twice in this literal with the same value.
    extractedId: "Extracted ID",
    urlClaim: "URL claim",
    addMulLabel:
      "Add default (mul) label for humans with identical en/de/fr labels",
    mulLabelValue: "Label value",
    mulLabelLanguages: "Matching languages",
    addMulAlias:
      "Add default (mul) alias and remove per-language duplicates (≥6 languages)",
    mulAliasLangCount: "Language count",
    self_stated_in:
      "Remove tautological 'stated in' (P248) references on external-ID claims",
    redundantCitizenshipDates:
      "Remove redundant start/end time on country of citizenship (P27) matching date of birth/death",
    fixCopiedLabel:
      "Replace label copied from English with the local Wikipedia title",
    fixCopiedLabelLang: "Language",
    fixCopiedLabelBefore: "Current label (= English)",
    fixCopiedLabelAfter: "Wikipedia title",
    // settings sections
    generalSettings: "General",
    detectorSettings: "Options",
    cacheSettings: "Cache & Buffers",
    // column headers
    property: "Property",
    removedValues: "Removed values",
    field: "Field",
    lang: "Language",
    original: "Original text",
    normalized: "Normalized text",
    aliasLabel: "Alias identical to label",
    value: "Value",
    precision: "Precision",
    p7452: "P7452 present?",
    mergedInto: "Merged into",
    context: "Context",
    oldProperty: "Old property",
    newProperty: "New property",
    reference: "Reference",
    removed: "Removed",
    becauseOf: "Because of",
    description: "Description text",
    idPresent: "ID property present",
    suggestedProperty: "Suggested Property",
    referenceUrl: "Reference URL",
    externalIdProperty: "External ID property",
    externalIdClaim: "Matching external ID",
    oldValue: "Old value",
    newValue: "New value",
    count: "Count",
    // menu
    startPreview: "Preview cleanup",
    runPreview: "Run a preview of all cleanup changes",
    autoStart: "Auto-start cleanup preview on page load",
    applyFailed: "Failed to apply cleanup.",
    cleanupFailed: "Cleanup failed.",
    yes: "yes",
    no: "no",
  },
  nl: {
    cleanupTitle: "🧹 Opschoontool",
    cleanupIntro: "De volgende opschoonacties zijn mogelijk:",
    runButton: "Opschonen uitvoeren",
    settingsButton: "⚙ Instellingen",
    noCleanups: "Geen opschoonacties gevonden",
    settingsTitle: "Instellingen voor opschoonacties",
    save: "Opslaan",
    cancel: "Annuleren",
    wikimedia: "Verwijder 'geïmporteerd uit Wikimedia-project'-referenties",
    aggregator: "Verwijder aggregatorreferenties",
    community: "Verwijder communityreferenties",
    redundant: "Verwijder FAST als LC aanwezig is",
    inferred: "Verwijder alleen-afgeleide referenties",
    normalizeLabels: "Normaliseer label-/beschrijving-/alias-tekst",
    removeAliasEqualsLabel: "Verwijder alias = label (zelfde taal)",
    removeLowPrecisionDates:
      "Verwijder onnauwkeurige dubbele geboortedata/sterfdata",
    removeRedundantPreferred:
      "Verlaag overbodige 'voorkeurs'-rangen (verwijder 'reden voor voorkeursrang' (P7452) indien aanwezig)",
    mergeSameDateClaims:
      "Datumclaims met dezelfde genormaliseerde waarde samenvoegen",
    property: "Eigenschap",
    removedValues: "Verwijderde waarden",
    field: "Veld",
    lang: "Taal",
    original: "Oorspronkelijke tekst",
    normalized: "Genormaliseerde tekst",
    aliasLabel: "Alias identiek aan label",
    value: "Waarde",
    precision: "Precisie",
    p7452: "P7452 aanwezig?",
    mergedInto: "Samengevoegd met",
    startPreview: "Opschoonpreview",
    runPreview: "Bekijk de mogelijke opschoonacties",
    autoStart: "Preview automatisch starten",
    applyFailed: "Uitvoeren mislukt",
    cleanupFailed: "Opschonen mislukt",
    yes: "Ja",
    no: "Nee",
  },
};
/**
 * Look up the message for `key`.
 * Walks the user's language fallback chain and returns the first non-empty
 * translation; otherwise returns the English text, or `key` itself when no
 * English text exists either.
 */
function getMsg(key) {
  for (const code of mw.language.getFallbackLanguageChain()) {
    const translated = i18n[code]?.[key];
    if (translated) {
      return translated;
    }
  }
  const english = i18n.en[key];
  return english ? english : key;
}
// ==== 03 Utilities ========================================================
/** True when `id` is a string consisting of "Q" followed by one or more digits. */
function isQid(id) {
  if (typeof id !== "string") {
    return false;
  }
  return /^Q\d+$/.test(id);
}
/**
 * Return the distinct valid QIDs of `arr`, in first-seen order.
 * Entries that fail isQid() are dropped.
 */
function uniq(arr) {
  const seen = new Set();
  for (const candidate of arr) {
    if (isQid(candidate)) {
      seen.add(candidate);
    }
  }
  return Array.from(seen);
}
/**
 * Normalise display text.
 * Steps, in order: U+2010 becomes "-", U+00A0 becomes a space, leading and
 * trailing runs of commas/whitespace are removed, and every remaining
 * whitespace run collapses to a single space. Falsy input is returned as-is.
 */
function normalizeText(str) {
  if (!str) {
    return str;
  }
  const asciiHyphens = str.replace(/\u2010/g, "-");
  const plainSpaces = asciiHyphens.replace(/\u00A0/g, " ");
  const trimmed = plainSpaces.replace(/^[,\s]+|[,\s]+$/g, "");
  return trimmed.replace(/\s+/g, " ");
}
/**
 * Trim and percent-decode a URL string.
 *
 * A URL containing a malformed percent sequence (a stray "%" or bytes that
 * are not valid UTF-8) makes decodeURIComponent throw a URIError; such a URL
 * is returned trimmed but otherwise unchanged instead of aborting the caller.
 *
 * @param {string} urlValue
 * @returns {string}
 */
function normalizeUrl(urlValue) {
  const trimmed = urlValue.trim();
  try {
    return decodeURIComponent(trimmed);
  } catch (err) {
    if (err instanceof URIError) {
      return trimmed;
    }
    throw err;
  }
}
/** Drop one trailing "/" from a string; any other value is returned untouched. */
function removeTrailingSlash(url) {
  if (typeof url !== "string" || !url.endsWith("/")) {
    return url;
  }
  return url.slice(0, -1);
}
/**
 * Clean tracking/functional parameters from a URL.
 * @param {string} rawUrl – absolute URL; it is passed to `new URL`, which
 *   throws when the string cannot be parsed
 * @param {Object} [opts]
 * @param {boolean} [opts.recognitionMode=false] – also strip functional
 *   params, any fragment, the hl/lang/locale/lr params and a trailing slash
 * @returns {{url: string, keepUrl: boolean}} `url` is `rawUrl` itself when
 *   nothing was changed, otherwise the re-serialised URL; `keepUrl` is true
 *   only when a RECOGNITION_STRIP param was removed
 */
function cleanUrl(rawUrl, { recognitionMode = false } = {}) {
const url = new URL(rawUrl);
let changed = false;
let keepUrl = false;
// Always-remove tracking params per hostname.
// Key conventions (apply to both hardcoded and wiki-sourced rules):
// "example.com" – exact match after stripping leading "www."
// ".example" – leading-dot key: matches a hostname that equals the key
// without its dot, or that starts with it followed by "."
// (e.g. ".scholar.google" matches scholar.google.com and
// scholar.google.com.au). Use this for domains with varying TLDs.
// "*" – applies to every hostname (see paramsFor).
// Plain entries also match via base-domain fallback (last two segments),
// so "linkedin.com" covers fr.linkedin.com, de.linkedin.com, etc.
//
// Hardcoded defaults are combined with rules fetched from a wiki page via
// Object.assign, so a fetched entry REPLACES (does not extend) a hardcoded
// entry that has the same key.
// NOTE(review): this comment used to name [[User:Difool/unrecognized_urls]],
// while URL_STRIP_PAGE in this file is "User:Difool/url_tracking_params" —
// confirm which page is the actual source.
const ALWAYS_STRIP = Object.assign(
{
"imdb.com": ["ref_"],
"m.imdb.com": ["ref_"],
"open.spotify.com": ["si"],
"researchgate.net": ["ev"],
"linkedin.com": [
"originalSubdomain",
"trk",
"success",
"original_referer",
],
".scholar.google": ["oi", "view_op", "sortby", "authuser"],
},
urlStripCache.always,
);
// Remove only in recognition mode (functional / UI params)
const RECOGNITION_STRIP = Object.assign(
{
"youtube.com": ["t", "ab_channel", "mode"],
"open.spotify.com": ["dl_branch", "nd"],
"itunes.apple.com": ["mt"],
},
urlStripCache.recognition,
);
const hostname = url.hostname.replace(/^www\./, "");
// Resolve strip params for a hostname:
// 1. Exact match (e.g. "imdb.com")
// 2. Leading-dot keys (e.g. ".scholar.google" matches scholar.google.com,
// scholar.google.com.au, etc.); the first matching key wins
// 3. Base-domain fallback — last two segments (e.g. "linkedin.com" matches
// fr.linkedin.com, de.linkedin.com, etc.); only tried when steps 1–2
// produced no params
// Note: use leading-dot keys for two-part TLDs like co.uk / com.au.
function paramsFor(map, host) {
// Global wildcard — applies to every hostname
const wildcard = map["*"] || [];
let specific = [];
if (map[host]) {
specific = map[host];
} else {
for (const key of Object.keys(map)) {
if (
key.startsWith(".") &&
(host === key.slice(1) || host.startsWith(key.slice(1) + "."))
) {
specific = map[key];
break;
}
}
if (!specific.length) {
const base = host.split(".").slice(-2).join(".");
specific = map[base] || [];
}
}
// Merge, deduplicating so a param listed globally isn't applied twice
// if a hostname entry also lists it.
return [...new Set([...wildcard, ...specific])];
}
// Remove empty hash
if (url.hash === "#") {
url.hash = "";
changed = true;
}
for (const param of paramsFor(ALWAYS_STRIP, hostname)) {
if (url.searchParams.has(param)) {
url.searchParams.delete(param);
changed = true;
}
}
if (recognitionMode) {
for (const param of paramsFor(RECOGNITION_STRIP, hostname)) {
if (url.searchParams.has(param)) {
url.searchParams.delete(param);
changed = true;
// Only removal of a recognition-only param sets keepUrl.
keepUrl = true;
}
}
// Recognition mode drops any fragment, not just an empty one.
if (url.hash && url.hash !== "#") {
url.hash = "";
changed = true;
}
// Language/locale params are stripped for every hostname in this mode.
for (const param of ["hl", "lang", "locale", "lr"]) {
if (url.searchParams.has(param)) {
url.searchParams.delete(param);
changed = true;
}
}
// NOTE(review): assigning `href` re-parses the string, so for a URL whose
// path is just "/" the slash is presumably restored on serialisation while
// `changed` is still set — confirm this is acceptable.
const trimmed = removeTrailingSlash(url.href);
if (trimmed !== url.href) {
url.href = trimmed;
changed = true;
}
}
// Return the caller's original string untouched unless something was changed.
let result = rawUrl;
if (changed) {
result = url.href;
// Outside recognition mode, when the input contained non-ASCII characters,
// undo the percent-encoding that URL serialisation introduced.
if (!recognitionMode && /[^\x00-\x7F]/.test(rawUrl)) {
result = decodeURI(result);
}
}
return { url: result, keepUrl };
}
/**
 * Make invisible/special characters visible for preview tables.
 * U+2010, U+00A0, runs of two or more spaces, and tabs are replaced, in that
 * order, by bracketed markers. Falsy input is returned as-is.
 */
function visualizeInvisibleChars(str) {
  if (!str) {
    return str;
  }
  const substitutions = [
    [/\u2010/g, "[HYPHEN]"],
    [/\u00A0/g, "[NO-BREAK]"],
    [/ {2,}/g, (run) => `[SPACE×${run.length}]`],
    [/\t/g, "[TAB]"],
  ];
  return substitutions.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    str,
  );
}
/**
 * Insert zero-width spaces (U+200B) so long URLs can wrap inside tables:
 * one after every "/" and one before every "&". Empty or non-string input is
 * returned untouched.
 */
function formatUrlForDisplay(url) {
  if (typeof url !== "string" || url === "") {
    return url;
  }
  const breakableSlashes = url.replace(/\//g, "/\u200b");
  return breakableSlashes.replace(/&/g, "\u200b&");
}
/**
 * Parse a Wikibase time string into a JS Date (or null on failure).
 *
 * "-00" month/day placeholders are replaced by "-01" so that reduced-precision
 * values still parse. Strings with a negative year, or a year of more than
 * four digits, are rewritten with the six-digit signed year that the
 * ECMAScript date-time format requires; previously these were handed to
 * `Date` in a form it does not reliably parse. The year number is passed
 * through unchanged (no BCE/astronomical-year adjustment is made).
 *
 * @param {string} timeStr e.g. "+1950-00-00T00:00:00Z"
 * @returns {Date|null}
 */
function parseWikibaseTime(timeStr) {
  if (!timeStr) return null;
  const parts = /^([+-]?)(\d{1,6})(-.*)$/.exec(timeStr);
  let iso;
  if (parts && (parts[1] === "-" || parts[2].length > 4)) {
    const sign = parts[1] === "-" ? "-" : "+";
    const year = parts[2].padStart(6, "0");
    // Only the part after the year is patched, so a year such as "-0050"
    // is no longer mangled by the "-00" placeholder replacement.
    iso = `${sign}${year}${parts[3].replace(/-00/g, "-01")}`;
  } else {
    iso = timeStr.replace(/^\+/, "").replace(/-00/g, "-01");
  }
  const parsed = new Date(iso);
  return Number.isNaN(parsed.getTime()) ? null : parsed;
}
/**
 * Check that a URL pattern compiles (via compileAnchoredRegex) and can be
 * tested against a sample URL.
 * @returns {{valid: boolean, error: (string|null)}} `error` holds the thrown
 *   error's message when compilation or matching fails
 */
function validateUrlPattern(pattern) {
  const outcome = { valid: true, error: null };
  try {
    const anchored = compileAnchoredRegex(pattern);
    anchored.test("https://example.com/test");
  } catch (failure) {
    outcome.valid = false;
    outcome.error = failure.message;
  }
  return outcome;
}
/**
 * Check that a property-format pattern compiles as a Unicode-mode RegExp and
 * can be tested against a sample value.
 * @returns {{valid: boolean, error: (string|null)}} `error` holds the thrown
 *   error's message when the pattern is rejected
 */
function validatePropertyRegex(pattern) {
  const outcome = { valid: true, error: null };
  try {
    const compiled = new RegExp(pattern, "u");
    compiled.test("1234567890");
  } catch (failure) {
    outcome.valid = false;
    outcome.error = failure.message;
  }
  return outcome;
}
/**
 * Remove unnecessary backslashes before "-" and "/" outside character classes.
 *
 * \- inside a character class [...] is valid JS and means "literal hyphen"
 * unambiguously, so it is left alone. Outside a class the backslash before
 * "-" or "/" is unnecessary and is dropped.
 *
 * Every backslash is consumed together with the character it escapes. This
 * keeps an escaped backslash ("\\-") from being read as an escaped hyphen and
 * keeps an escaped bracket ("\]" or "\[") from opening or closing a class —
 * both of which previously changed the meaning of the pattern.
 *
 * @param {string} rawPattern
 * @returns {string}
 */
function sanitizePattern(rawPattern) {
  let sanitized = "";
  let inClass = false;
  for (let i = 0; i < rawPattern.length; i++) {
    const ch = rawPattern[i];
    if (ch === "\\") {
      const escaped = rawPattern[i + 1];
      if (escaped === undefined) {
        // Dangling backslash at the end of the pattern: keep it verbatim.
        sanitized += ch;
        break;
      }
      const droppable = !inClass && (escaped === "-" || escaped === "/");
      sanitized += droppable ? escaped : ch + escaped;
      i++; // the escaped character has been handled
      continue;
    }
    if (ch === "[") {
      inClass = true;
    } else if (ch === "]" && inClass) {
      inClass = false;
    }
    sanitized += ch;
  }
  return sanitized;
}
/**
 * Convert Wikidata-dialect regex syntax to JavaScript.
 *
 * After sanitizePattern(), the following rewrites are applied:
 *  - named groups `(?P<name>` and `(?'name'` become `(?<name>`
 *  - `\g<name>` and `\g'name'` become `\k<name>`
 *  - atomic groups `(?>` become non-capturing groups `(?:`
 *
 * Atomic groups do not capture, so they are mapped to a non-capturing group;
 * mapping them to a plain "(" (as before) inserted an extra capture group and
 * shifted every numbered group/backreference after it.
 *
 * @param {string} rawPattern
 * @returns {string}
 */
function convertWikidataRegexToJS(rawPattern) {
  let p = sanitizePattern(rawPattern);
  p = p.replace(/\(\?P<([a-zA-Z0-9_]+)>/g, "(?<$1>");
  p = p.replace(/\(\?'([a-zA-Z0-9_]+)'/g, "(?<$1>");
  p = p.replace(/\\g<([a-zA-Z0-9_]+)>/g, "\\k<$1>");
  p = p.replace(/\\g'([a-zA-Z0-9_]+)'/g, "\\k<$1>");
  p = p.replace(/\(\?>/g, "(?:"); // atomic groups -> non-capturing groups
  return p;
}
// Hostnames treated as web-archive services; subdomains of these count too.
const ARCHIVE_DOMAINS = [
  "web.archive.org",
  "archive.is",
  "wayback.archive-it.org",
];
/**
 * True when `url` parses and its lower-cased hostname is one of
 * ARCHIVE_DOMAINS or a subdomain of one. Empty or unparsable input yields false.
 */
function isArchiveUrl(url) {
  if (!url) {
    return false;
  }
  let host;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    return false;
  }
  for (const domain of ARCHIVE_DOMAINS) {
    if (host === domain || host.endsWith("." + domain)) {
      return true;
    }
  }
  return false;
}
/**
 * Classify a URL by its hostname.
 *
 * A host counts as Wikimedia when its registrable domain is one of the listed
 * project domains under .org (mediawiki, wikibooks, wikidata, wikimedia,
 * wikipedia, wikifunctions, wikinews, wikiquote, wikisource, wikispecies,
 * wikiversity, wikivoyage, wiktionary, wmflabs), with or without subdomains.
 *
 * The project name must start at a label boundary: look-alike hosts such as
 * "fakewikipedia.org" are rejected and bare domains such as "wikisource.org"
 * are accepted (the previous pattern got both of these wrong).
 *
 * @param {string} url
 * @returns {{isWikimediaHost: boolean, isWikidataItem: boolean, isWikisourceItem: boolean}}
 *   all false when `url` cannot be parsed
 */
function analyzeWikimediaUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    return {
      isWikimediaHost: false,
      isWikidataItem: false,
      isWikisourceItem: false,
    };
  }
  const match =
    /^(?:.+\.)?(mediawiki|wik(?:i(?:books|data|[mp]edia|functions|news|quote|source|species|versity|voyage)|tionary)|wmflabs)\.org$/.exec(
      hostname,
    );
  const project = match ? match[1] : null;
  return {
    isWikimediaHost: project !== null,
    isWikidataItem: project === "wikidata",
    isWikisourceItem: project === "wikisource",
  };
}
/**
 * True when `url` is on a Wikimedia host that is neither Wikidata nor
 * Wikisource, as classified by analyzeWikimediaUrl().
 */
function isWikimediaImportUrl(url) {
  const analysis = analyzeWikimediaUrl(url);
  if (!analysis.isWikimediaHost) {
    return false;
  }
  return !(analysis.isWikidataItem || analysis.isWikisourceItem);
}
// ==== 04 In-memory caches =================================================
// Module-level in-memory caches. The Map/Set instances are the `target`
// objects that the cache_* load/save/reset helpers in this file operate on.
const occupationParentsCache = new Map(); // QID -> Set<parentQID>
// urlStripCache holds { always: { hostname: [param,...] }, recognition: { hostname: [param,...] } }
// loaded from the wiki page; cleanUrl() combines it with the hardcoded
// defaults via Object.assign, so an entry here replaces a default with the same key.
const urlStripCache = { always: {}, recognition: {} };
const wikipediaEditionsCache = new Map(); // QID -> language code
const wikipediaLangNamesCache = new Map(); // language code -> language name
// urlBlocklistCache holds an array of rule objects parsed from the blocklist page.
// Each rule: { pattern: string, matchType: "prefix"|"regex", action: "remove"|"deprecate",
// sectionLabel: string, compiledRegex?: RegExp }
const urlBlocklistCache = { rules: [], timestamp: 0 };
// NOTE(review): presumably holds the PIDs of obsolete identifier properties — confirm.
const obsoleteIdProps = new Set();
const propertyRegexCache = new Map(); // PID -> regex string
const propertyUrlPatternsCache = new Map(); // PID -> [{pattern, replacement}]
const propertyStatedInCache = new Map(); // PID -> {preferred, invalid}
// sourceCategoryCache holds the rules parsed from the wiki page.
// { aggregator: Set<PID>, community: Set<PID>,
// redundant: Array<{ weakPid: string, strongPid: string, strongQid: string|null }> }
const sourceCategoryCache = {
aggregator: new Set(),
community: new Set(),
redundant: [],
};
// IndexedDB connection state: both are set by initIndexedDB() and checked by
// every IndexedDB helper before it touches the database.
let indexedDBReady = false;
let indexedDB_instance = null;
// ==== 05 IndexedDB helpers ================================================
/**
 * Open the tool's IndexedDB database.
 *
 * On upgrade, an object store (keyPath "key") is created for every entry of
 * `caches` that names an `indexeddb_name` not yet present. On success the
 * connection is stored in `indexedDB_instance`, `indexedDBReady` becomes true
 * and the promise resolves with the connection; on error `indexedDBReady` is
 * set to false and the promise rejects with the request's error.
 */
async function initIndexedDB() {
  return new Promise((resolve, reject) => {
    const openRequest = indexedDB.open(INDEXEDDB_NAME, INDEXEDDB_VERSION);
    openRequest.onupgradeneeded = (event) => {
      const database = event.target.result;
      for (const cacheEntry of caches) {
        const storeName = cacheEntry.indexeddb_name;
        if (!storeName) continue;
        // Checked per entry so two entries sharing a store create it once.
        if (database.objectStoreNames.contains(storeName)) continue;
        database.createObjectStore(storeName, { keyPath: "key" });
      }
    };
    openRequest.onsuccess = () => {
      indexedDB_instance = openRequest.result;
      indexedDBReady = true;
      resolve(indexedDB_instance);
    };
    openRequest.onerror = () => {
      console.error(`${TOOL_NAME}: failed to open IndexedDB`, openRequest.error);
      indexedDBReady = false;
      reject(openRequest.error);
    };
  });
}
/**
 * Open a transaction on a single object store and return that store.
 * Returns null when IndexedDB is not ready or when opening the transaction
 * or store throws (the error is logged).
 * @param {string} storeName
 * @param {string} mode transaction mode, e.g. "readonly" or "readwrite"
 */
function getIndexedDBStore(storeName, mode) {
  const usable = indexedDBReady && indexedDB_instance;
  if (!usable) {
    return null;
  }
  try {
    const transaction = indexedDB_instance.transaction([storeName], mode);
    return transaction.objectStore(storeName);
  } catch (err) {
    console.error(`${TOOL_NAME}: failed to open store "${storeName}"`, err);
    return null;
  }
}
// Convenience wrappers fixing the transaction mode.
const getIndexedDBReadTx = (storeName) =>
  getIndexedDBStore(storeName, "readonly");
const getIndexedDBWriteTx = (storeName) =>
  getIndexedDBStore(storeName, "readwrite");
/**
 * Persist one cache entry to its IndexedDB object store.
 *
 * Map targets are stored as an `entries` array (Set values are converted to
 * arrays when `valueKind` is "set"); Set targets are stored as an `items`
 * array. Each record carries the entry key and the current timestamp.
 *
 * @returns {Promise<boolean>} true once the record is written; false when
 *   IndexedDB is unavailable, the entry names no store, the store cannot be
 *   opened, or the target type is unsupported. Rejects with the request error
 *   when the write itself fails.
 */
async function cache_saveIndexedDB(entry) {
  const unavailable = !indexedDBReady || !indexedDB_instance;
  if (unavailable) {
    console.warn(
      `${TOOL_NAME}: IndexedDB not ready, cannot save ${entry.key}`,
    );
    return false;
  }
  const storeName = entry.indexeddb_name;
  if (!storeName) return false;
  const store = getIndexedDBWriteTx(storeName);
  if (!store) return false;

  const { key, target, type } = entry;
  const valueKind = entry.valueKind === undefined ? "plain" : entry.valueKind;
  const isMap = type === "map" && target instanceof Map;
  const isSet = type === "set" && target instanceof Set;
  if (!isMap && !isSet) {
    console.warn(`${TOOL_NAME}: unsupported cache type for ${key}`);
    return false;
  }

  let payload;
  if (isMap) {
    const entries = [];
    for (const [k, v] of target) {
      entries.push([k, valueKind === "set" ? Array.from(v) : v]);
    }
    payload = { key, timestamp: Date.now(), kind: "map", valueKind, entries };
  } else {
    payload = {
      key,
      timestamp: Date.now(),
      kind: "set",
      items: [...target],
    };
  }

  return new Promise((resolve, reject) => {
    const request = store.put(payload);
    request.onsuccess = () => resolve(true);
    request.onerror = () => {
      console.warn(`${TOOL_NAME}: save failed for ${key}`, request.error);
      reject(request.error);
    };
  });
}
/**
 * Load one cache entry from its IndexedDB object store into `entry.target`.
 *
 * The stored record is validated and converted BEFORE the in-memory target is
 * cleared, so an unusable record never wipes or half-fills the target. Any
 * exception raised while handling the record is caught, so the returned
 * promise always settles (previously a malformed record threw inside the
 * request callback and left the promise pending forever).
 *
 * @param {{key: string, target: (Map|Set), ttl: number, indexeddb_name: string}} entry
 * @returns {Promise<boolean>} true when the target was populated; false when
 *   IndexedDB is unavailable, nothing is stored, the record has expired (it is
 *   then reset) or the record does not fit the target. Rejects with the
 *   request error when the read itself fails.
 */
async function cache_loadIndexedDB(entry) {
  if (!indexedDBReady || !indexedDB_instance) return false;
  const { key, target, ttl, indexeddb_name: storeName } = entry;
  if (!storeName) return false;
  const tx = getIndexedDBReadTx(storeName);
  if (!tx) return false;
  return new Promise((resolve, reject) => {
    const req = tx.get(key);
    req.onerror = () => reject(req.error);
    req.onsuccess = () => {
      try {
        const parsed = req.result;
        if (!parsed) {
          resolve(false);
          return;
        }
        if (parsed.timestamp && Date.now() - parsed.timestamp > ttl) {
          // Best-effort cleanup; cache_resetIndexedDB logs its own failures.
          cache_resetIndexedDB(entry).catch(() => {});
          resolve(false);
          return;
        }
        const fitsMap =
          parsed.kind === "map" &&
          target instanceof Map &&
          Array.isArray(parsed.entries);
        const fitsSet =
          parsed.kind === "set" &&
          target instanceof Set &&
          Array.isArray(parsed.items);
        if (!fitsMap && !fitsSet) {
          console.warn(`${TOOL_NAME}: type mismatch for ${key}`);
          resolve(false);
          return;
        }
        if (fitsMap) {
          // Convert first; only touch the target once conversion succeeded.
          const staged = parsed.entries.map(([k, v]) => [
            k,
            parsed.valueKind === "set" ? new Set(v) : v,
          ]);
          target.clear();
          for (const [k, v] of staged) target.set(k, v);
        } else {
          target.clear();
          for (const item of parsed.items) target.add(item);
        }
        resolve(true);
      } catch (e) {
        console.warn(`${TOOL_NAME}: failed to load cache ${key}`, e);
        resolve(false);
      }
    };
  });
}
/**
 * Clear a cache entry both in memory and in its IndexedDB store.
 * The in-memory target is only cleared once a write store was obtained.
 * @returns {Promise<boolean>} true after the stored record is deleted; false
 *   when IndexedDB is unavailable, the entry names no store, or the store
 *   cannot be opened. Rejects with the request error when deletion fails.
 */
async function cache_resetIndexedDB(entry) {
  const available = indexedDBReady && indexedDB_instance;
  if (!available) return false;
  const storeName = entry.indexeddb_name;
  if (!storeName) return false;
  const store = getIndexedDBWriteTx(storeName);
  if (!store) return false;
  const { key, target } = entry;
  target.clear();
  return new Promise((resolve, reject) => {
    const request = store.delete(key);
    request.onsuccess = () => resolve(true);
    request.onerror = () => {
      console.warn(`${TOOL_NAME}: reset failed for ${key}`, request.error);
      reject(request.error);
    };
  });
}
/**
 * Describe the state of an IndexedDB-backed cache entry as a sentence.
 * The item count comes from the in-memory target; the expiry date is the
 * stored record's timestamp plus the entry's TTL. Never rejects: read
 * failures yield a "status unavailable" message.
 * @returns {Promise<string>}
 */
async function cache_getStatusIndexedDB(entry) {
  if (!(indexedDBReady && indexedDB_instance)) {
    return `${entry.label} IndexedDB cache is unavailable`;
  }
  const { key, target, ttl: ttlMs, label, indexeddb_name: storeName } = entry;
  const statusUnavailable = `${label} IndexedDB cache status unavailable`;
  if (!storeName) return statusUnavailable;
  const store = getIndexedDBReadTx(storeName);
  if (!store) return statusUnavailable;
  return new Promise((resolve) => {
    const request = store.get(key);
    request.onerror = () => resolve(statusUnavailable);
    request.onsuccess = () => {
      const record = request.result;
      const count = target.size;
      if (record) {
        let expiry = "unknown";
        if (record.timestamp) {
          expiry = new Date(record.timestamp + ttlMs).toLocaleDateString();
        }
        resolve(
          `${label} IndexedDB cache contains ${count} items, will be reset on ${expiry}`,
        );
        return;
      }
      if (count > 0) {
        resolve(
          `${label} cache contains ${count} items (in memory, not yet synced to IndexedDB)`,
        );
      } else {
        resolve(`${label} IndexedDB cache is empty`);
      }
    };
  });
}
// ==== 06 localStorage cache helpers =======================================
/**
 * Serialise a cache entry's Map or Set target to localStorage under
 * `entry.key`, together with the current timestamp. Set values inside a Map
 * are stored as arrays when `valueKind` is "set". Unsupported targets and
 * storage failures are logged and otherwise ignored.
 */
function cache_saveLocalSt(entry) {
  const { key, target, type, valueKind = "plain" } = entry;
  try {
    const isMap = type === "map" && target instanceof Map;
    const isSet = type === "set" && target instanceof Set;
    if (!isMap && !isSet) {
      console.warn(`${TOOL_NAME}: unsupported cache type for ${key}`);
      return;
    }
    let payload;
    if (isMap) {
      const entries = [];
      for (const [k, v] of target) {
        entries.push([k, valueKind === "set" ? Array.from(v) : v]);
      }
      payload = { timestamp: Date.now(), kind: "map", valueKind, entries };
    } else {
      payload = {
        timestamp: Date.now(),
        kind: "set",
        items: [...target],
      };
    }
    localStorage.setItem(key, JSON.stringify(payload));
  } catch (err) {
    console.warn(`${TOOL_NAME}: failed to save ${key}`, err);
  }
}
/**
 * Load a cache entry from localStorage into `entry.target`.
 *
 * The stored payload is validated and converted BEFORE the in-memory target
 * is cleared, so a malformed or mismatching payload can no longer wipe or
 * half-fill the target while the function reports failure.
 *
 * @param {{key: string, target: (Map|Set), ttl: number}} entry
 * @returns {boolean} true when the target was populated; false when nothing
 *   is stored, the payload has expired (it is then removed), the payload does
 *   not fit the target, or reading/parsing failed (logged).
 */
function cache_loadLocalSt(entry) {
  const { key, target, ttl } = entry;
  try {
    const raw = localStorage.getItem(key);
    if (!raw) return false;
    const parsed = JSON.parse(raw);
    if (!parsed) return false;
    if (parsed.timestamp && Date.now() - parsed.timestamp > ttl) {
      localStorage.removeItem(key);
      return false;
    }
    if (
      parsed.kind === "map" &&
      target instanceof Map &&
      Array.isArray(parsed.entries)
    ) {
      // Convert first; only touch the target once conversion succeeded.
      const staged = parsed.entries.map(([k, v]) => [
        k,
        parsed.valueKind === "set" ? new Set(v) : v,
      ]);
      target.clear();
      for (const [k, v] of staged) target.set(k, v);
      return true;
    }
    if (
      parsed.kind === "set" &&
      target instanceof Set &&
      Array.isArray(parsed.items)
    ) {
      target.clear();
      for (const item of parsed.items) target.add(item);
      return true;
    }
    console.warn(`${TOOL_NAME}: type mismatch for ${key}`);
    return false;
  } catch (e) {
    console.warn(`${TOOL_NAME}: failed to load cache ${key}`, e);
    return false;
  }
}
/**
 * Empty a cache entry's in-memory target and delete its localStorage record.
 * Failures are logged and otherwise ignored.
 */
function cache_resetLocalSt(entry) {
  try {
    const { target, key } = entry;
    target.clear();
    localStorage.removeItem(key);
  } catch (err) {
    console.warn(`${TOOL_NAME}: failed to reset cache ${entry.key}`, err);
  }
}
/**
 * Describe the state of a localStorage-backed cache entry as a sentence.
 * The item count comes from the in-memory target; the expiry date is the
 * stored timestamp plus the entry's TTL.
 * @returns {string}
 */
function cache_getStatusLocalSt(entry) {
  const { key, target, ttl: ttlMs, label } = entry;
  const raw = localStorage.getItem(key);
  if (!raw) {
    return `${label} cache is empty`;
  }
  try {
    const stored = JSON.parse(raw);
    let expiry = "unknown";
    if (stored.timestamp) {
      expiry = new Date(stored.timestamp + ttlMs).toLocaleDateString();
    }
    return `${label} cache contains ${target.size} items, will be reset on ${expiry}`;
  } catch {
    return `${label} cache status unavailable`;
  }
}
/**
 * Unified status query. Always returns a promise.
 * Precedence: the entry's own `_customStatus` hook, then IndexedDB when the
 * entry names an object store, otherwise localStorage.
 */
function cache_getStatus(entry) {
  if (entry._customStatus) {
    return Promise.resolve(entry._customStatus());
  }
  if (entry.indexeddb_name) {
    return cache_getStatusIndexedDB(entry);
  }
  return Promise.resolve(cache_getStatusLocalSt(entry));
}
/**
 * Unified reset. Always returns a promise.
 * Precedence: the entry's own `_customReset` hook, then IndexedDB when the
 * entry names an object store, otherwise localStorage.
 */
function cache_reset(entry) {
  if (entry._customReset) {
    return Promise.resolve(entry._customReset());
  }
  if (entry.indexeddb_name) {
    return cache_resetIndexedDB(entry);
  }
  return Promise.resolve(cache_resetLocalSt(entry));
}
// ==== 07 Settings =========================================================
// Defaults used by cache_loadSettings() when nothing (or nothing parsable) is
// stored. `enabledDetectors` keys match detector message keys in `i18n`.
// NOTE(review): only nine detectors are listed here although `i18n` labels
// many more — confirm how a detector missing from this map is treated.
const defaultSettings = {
autoStartPreview: true,
enableLargeBuffers: false,
enableHeavyComputing: false,
enabledDetectors: {
wikimedia: true,
aggregator: true,
community: true,
redundant: true,
inferred: true,
normalizeLabels: true,
removeAliasEqualsLabel: true,
removeLowPrecisionDates: true,
removeRedundantPreferred: true,
},
};
/**
 * Load the user's settings from localStorage, layered over `defaultSettings`.
 *
 * `enabledDetectors` is merged key by key, so a detector that exists in the
 * defaults but not in the stored settings keeps its default instead of
 * disappearing (the previous shallow merge replaced the whole nested object).
 * The returned object never shares the nested `enabledDetectors` object with
 * `defaultSettings`, so callers can mutate it safely.
 *
 * @returns {Object} settings object; pure defaults when nothing usable is stored
 */
function cache_loadSettings() {
  let stored = null;
  try {
    stored = JSON.parse(localStorage.getItem(`${TOOL_NAME}_settings`));
  } catch (e) {
    console.warn(`${TOOL_NAME}: failed to load settings, using defaults`, e);
  }
  const isPlainObject = (v) =>
    v !== null && typeof v === "object" && !Array.isArray(v);
  const saved = isPlainObject(stored) ? stored : {};
  const savedDetectors = isPlainObject(saved.enabledDetectors)
    ? saved.enabledDetectors
    : {};
  return {
    ...defaultSettings,
    ...saved,
    enabledDetectors: {
      ...defaultSettings.enabledDetectors,
      ...savedDetectors,
    },
  };
}
/** Serialise `settings` as JSON into localStorage under "<TOOL_NAME>_settings". */
function cache_saveSettings(settings) {
  const storageKey = `${TOOL_NAME}_settings`;
  const serialized = JSON.stringify(settings);
  localStorage.setItem(storageKey, serialized);
}
// ==== 08 API helpers ======================================================
/**
 * Fetch the Wikipedia edition QID -> language code map by parsing the
 * rendered "Wikidata:Database_reports/Wikipedia_versions" page.
 *
 * For each wikitable row, the QID is taken from the item link in the second
 * cell and the language code from the fourth cell (empty string when the row
 * has fewer than four cells). Any failure is logged and yields an empty Map.
 *
 * @returns {Promise<Map<string, string>>}
 */
async function api_fetchWikipediaEditions() {
  try {
    const response = await new mw.Api().get({
      action: "parse",
      page: "Wikidata:Database_reports/Wikipedia_versions",
      prop: "text",
      format: "json",
    });
    const html = response?.parse?.text?.["*"];
    if (!html) {
      return new Map();
    }
    const parsedDoc = new DOMParser().parseFromString(html, "text/html");
    const qidToLanguage = new Map();
    for (const row of parsedDoc.querySelectorAll("table.wikitable tr")) {
      const cells = row.querySelectorAll("td");
      if (cells.length < 2) continue;
      const itemLink = cells[1].querySelector("a[href*='/wiki/Q']");
      if (!itemLink) continue;
      const qidMatch = itemLink.getAttribute("href").match(/\/wiki\/(Q\d+)/);
      if (!qidMatch) continue;
      // cells[3] may contain two values separated by <br> (e.g. "da<br>dk").
      // Only the first one (the Wikipedia language code) is wanted: use the
      // leading text node when there is one, otherwise the first
      // whitespace-separated token of the cell text.
      let langCode = "";
      if (cells.length >= 4) {
        const langCell = cells[3];
        const leading = langCell.firstChild;
        if (leading?.nodeType === Node.TEXT_NODE) {
          langCode = leading.textContent.trim();
        } else {
          langCode = langCell.textContent.split(/\s+/)[0] || "";
        }
      }
      qidToLanguage.set(qidMatch[1], langCode);
    }
    console.log(
      `${TOOL_NAME}: fetched ${qidToLanguage.size} Wikipedia edition QIDs`,
    );
    return qidToLanguage;
  } catch (err) {
    console.error(`${TOOL_NAME}: failed to fetch Wikipedia edition QIDs`, err);
    return new Map();
  }
}
/**
 * Retrieve the language codes (with display names) accepted for Wikidata
 * terms, using the `wbcontentlanguages` meta query in the "term" context.
 *
 * @returns {Promise<object>} The `query.wbcontentlanguages` object of the
 *   response ({ code: { name } }), or {} when that field is missing or the
 *   request fails.
 */
async function api_fetchContentLanguages() {
  const request = {
    action: "query",
    meta: "wbcontentlanguages",
    wbclcontext: "term",
    wbclprop: "name",
    formatversion: 2,
    format: "json",
  };
  try {
    const response = await new mw.Api().get(request);
    const languages = response?.query?.wbcontentlanguages;
    if (languages) {
      const total = Object.keys(languages).length;
      console.log(`${TOOL_NAME}: fetched ${total} content languages`);
      return languages; // { code: { name } }
    }
    return {};
  } catch (err) {
    console.error(`${TOOL_NAME}: failed to fetch content languages`, err);
    return {};
  }
}
/** Fetch multiple entities by QID, chunking at CHUNK_SIZE. */
async function api_fetchEntities(qids) {
const unique = uniq(qids);
if (!unique.length) return {};
const results = {};
for (let i = 0; i < unique.length; i += CHUNK_SIZE) {
const chunk = unique.slice(i, i + CHUNK_SIZE);
try {
const data = await new mw.Api().get({
action: "wbgetentities",
format: "json",
ids: chunk.join("|"),
props: "claims",
});
Object.assign(results, data.entities || {});
} catch (e) {
console.error(`${TOOL_NAME}: wbgetentities failed for chunk`, chunk, e);
}
}
return results;
}
/**
 * Collect the ids of all properties (namespace 120) found by the search
 * "haswbstatement:P31=Q108951239|P31=Q60457486", following `sroffset`
 * continuation until the result set is exhausted.
 *
 * A failing request is logged and ends the paging; whatever was collected up
 * to that point is still returned.
 *
 * @returns {Promise<Set<string>>} Property ids with the "Property:" prefix removed.
 */
async function api_fetchAllObsoleteIdProps() {
  const api = new mw.Api();
  const query = {
    action: "query",
    format: "json",
    list: "search",
    formatversion: 2,
    srsearch: "haswbstatement:P31=Q108951239|P31=Q60457486",
    srnamespace: 120,
    srlimit: 50,
    srprop: "",
  };
  const found = new Set();
  let continuation = {};
  for (;;) {
    try {
      const page = await api.get({ ...query, ...continuation });
      const hits = page.query?.search;
      if (hits) {
        const ids = hits.map((hit) => hit.title.replace("Property:", ""));
        ids.forEach((id) => found.add(id));
      }
      continuation = page.continue || {};
    } catch (err) {
      console.error(`${TOOL_NAME}: api_fetchAllObsoleteIdProps failed`, err);
      break;
    }
    // No further offset means the last page has been read.
    if (!continuation.sroffset) break;
  }
  return found;
}
/**
 * Query the Wikidata SPARQL endpoint for the regex (P1793) attached to the
 * Q21502404 constraint statement of every external-id property.
 *
 * Each regex is passed through sanitizePattern(); a warning is logged when
 * sanitizing changed it.
 *
 * @returns {Promise<Array<[string, string]>>} [propertyId, sanitizedRegex]
 *   pairs, or [] when the request or response handling fails.
 */
async function api_fetchPropertyRegexConstraints() {
  const sparql = `
SELECT ?property ?regex WHERE {
?property wikibase:propertyType wikibase:ExternalId;
p:P2302 ?constraintStatement.
?constraintStatement ps:P2302 wd:Q21502404;
pq:P1793 ?regex.
}
`;
  const endpoint =
    "https://query.wikidata.org/sparql?query=" +
    encodeURIComponent(sparql) +
    "&format=json";
  try {
    const response = await fetch(endpoint);
    const payload = await response.json();
    const pairs = [];
    for (const binding of payload.results.bindings) {
      const propId = binding.property.value.replace(
        WIKIDATA_ENTITY_BASE_URL,
        "",
      );
      const raw = binding.regex.value;
      const san = sanitizePattern(raw);
      if (raw !== san) {
        console.warn(
          `${TOOL_NAME}: sanitized regex for ${propId}: "${raw}" -> "${san}"`,
        );
      }
      pairs.push([propId, san]);
    }
    return pairs;
  } catch (err) {
    console.error(
      `${TOOL_NAME}: failed to fetch property regex constraints`,
      err,
    );
    return [];
  }
}
/**
 * Decide whether a URL match pattern of a property must be skipped.
 *
 * - P1184: every pattern is skipped.
 * - GND ID: patterns mentioning "deutsche-biographie" or "lagis-hessen" are skipped.
 * - Deutsche Biographie (GND) ID: only patterns mentioning
 *   "deutsche-biographie" are kept.
 *
 * @param {string} pid - Property id the pattern belongs to.
 * @param {string} expr - The pattern text.
 * @returns {boolean} true when the property+pattern combination should be ignored.
 */
function shouldIgnorePattern(pid, expr) {
  switch (pid) {
    case "P1184":
      return true;
    case PID_GND_ID: {
      const foreignHosts = ["deutsche-biographie", "lagis-hessen"];
      return foreignHosts.some((host) => expr.includes(host));
    }
    case PID_DEUTSCHE_BIOGRAPHIE_GND_ID:
      return !expr.includes("deutsche-biographie");
    default:
      return false;
  }
}
/**
 * Query the Wikidata SPARQL endpoint for the URL match patterns (P8966) of
 * every external-id property, together with the optional replacement value
 * (P8967 qualifier).
 *
 * Each pattern is passed through convertWikidataRegexToJS(); a warning is
 * logged when the conversion changed it.
 *
 * @returns {Promise<Map<string, Array<{pattern: string, replacement: (string|undefined)}>>>}
 *   Property id -> pattern objects in result order, or an empty Map when the
 *   request or response handling fails.
 */
async function api_fetchPropertyUrlMatchPatterns() {
  const sparql = `
SELECT DISTINCT ?property ?expr ?repl WHERE {
?property wikibase:propertyType wikibase:ExternalId;
wdt:P8966 ?expr.
OPTIONAL {
?property p:P8966 ?statement.
?statement ps:P8966 ?expr;
pq:P8967 ?repl.
}
FILTER((STR(?expr)) != "")
}
`;
  const endpoint =
    "https://query.wikidata.org/sparql?query=" +
    encodeURIComponent(sparql) +
    "&format=json";
  try {
    const response = await fetch(endpoint);
    const payload = await response.json();
    const patternsByProp = new Map();
    payload.results.bindings.forEach((row) => {
      const propId = row.property.value.replace(WIKIDATA_ENTITY_BASE_URL, "");
      const raw = row.expr.value;
      const san = convertWikidataRegexToJS(raw);
      if (raw !== san) {
        console.warn(
          `${TOOL_NAME}: sanitized pattern for ${propId}: "${raw}" -> "${san}"`,
        );
      }
      const item = { pattern: san, replacement: row.repl?.value };
      const bucket = patternsByProp.get(propId);
      if (bucket) {
        bucket.push(item);
      } else {
        patternsByProp.set(propId, [item]);
      }
    });
    return patternsByProp;
  } catch (err) {
    console.error(
      `${TOOL_NAME}: failed to fetch property URL match patterns`,
      err,
    );
    return new Map();
  }
}
/**
 * Turn a Wikidata entity URI into its bare id by removing the first
 * occurrence of WIKIDATA_ENTITY_BASE_URL. Input without that prefix is
 * returned unchanged.
 *
 * @param {string} entityUrl - e.g. "http://www.wikidata.org/entity/Q42".
 * @returns {string} The id part, e.g. "Q42".
 */
function extractQID(entityUrl) {
  const entityId = entityUrl.replace(WIKIDATA_ENTITY_BASE_URL, "");
  return entityId;
}
/**
 * Fetch URL strip rules from [[User:Difool/unrecognized_urls]].
 *
 * The page is expected to hold a wikitable with three columns:
 *   Hostname | Mode | Parameters
 *
 * Hostname conventions (same as the hardcoded maps):
 *   "example.com"   - exact/base-domain match
 *   ".example.com"  - suffix match (covers country-code subdomains)
 *
 * Mode is compared case-insensitively and must be "always" or "recognition";
 * rows with any other mode are ignored.
 * Parameters is a comma-separated list of URL query parameter names; blank
 * names are dropped and rows without any name are ignored.
 *
 * Example table row:
 *   | twitter.com || always || fbclid, utm_source
 *
 * @returns {Promise<{always: Object<string, string[]>, recognition: Object<string, string[]>}>}
 *   Hostname -> parameter names per mode. Rows repeating a hostname append to
 *   its list. Both maps are empty when the page has no HTML or fetching fails.
 */
async function api_fetchUrlStripRules() {
  try {
    const response = await new mw.Api().get({
      action: "parse",
      page: URL_STRIP_PAGE,
      prop: "text",
      format: "json",
    });
    const html = response?.parse?.text?.["*"];
    if (!html) return { always: {}, recognition: {} };
    const parsedDoc = new DOMParser().parseFromString(html, "text/html");
    const rules = { always: {}, recognition: {} };
    for (const row of parsedDoc.querySelectorAll("table.wikitable tr")) {
      const cells = row.querySelectorAll("td");
      if (cells.length < 3) continue; // skip header rows
      const hostname = cells[0].textContent.trim();
      const mode = cells[1].textContent.trim().toLowerCase();
      const names = [];
      for (const piece of cells[2].textContent.trim().split(",")) {
        const name = piece.trim();
        if (name) names.push(name);
      }
      if (!hostname || names.length === 0) continue;
      if (mode !== "always" && mode !== "recognition") continue;
      const bucket = rules[mode];
      if (!bucket[hostname]) bucket[hostname] = [];
      bucket[hostname].push(...names);
    }
    console.log(
      `${TOOL_NAME}: fetched URL strip rules —`,
      Object.keys(rules.always).length,
      "always,",
      Object.keys(rules.recognition).length,
      "recognition",
    );
    return rules;
  } catch (err) {
    console.error(`${TOOL_NAME}: failed to fetch URL strip rules`, err);
    return { always: {}, recognition: {} };
  }
}
/**
 * Fetch the URL deprecation/removal blocklist from
 * [[User:Difool/URL-deprecation-blocklist]].
 *
 * The page is a series of === level-3 === sections, each holding one
 * wikitable whose first column ("Pattern") is the URL pattern and whose
 * second column ("Match type") is "prefix" or "regex" (case-insensitive).
 * Rows with another match type, an empty pattern, or a regex that fails to
 * compile are skipped. Tables appearing before the first heading are ignored.
 *
 * The section title determines the action:
 *   contains "URLs to remove" (any case) -> action: "remove"
 *   anything else                        -> action: "deprecate"
 *
 * A paragraph between the heading and its table may name the deprecation
 * reason, e.g. "Deprecation reason: {{Q|Q139894521}}", which is rendered as
 * an item link; the first such link found in the section is used.
 *
 * @returns {Promise<Array<{pattern: string, matchType: string, action: string,
 *   sectionLabel: string, deprecationReason: (string|null), compiledRegex?: RegExp}>>}
 *   Rules in page order; [] when the page has no HTML or fetching fails.
 */
async function api_fetchUrlBlocklist() {
  try {
    const response = await new mw.Api().get({
      action: "parse",
      page: URL_BLOCKLIST_PAGE,
      prop: "text",
      format: "json",
    });
    const html = response?.parse?.text?.["*"];
    if (!html) return [];
    const parsedDoc = new DOMParser().parseFromString(html, "text/html");
    const collected = [];
    // State of the section currently being walked. The parser output places
    // <h3>, <p> and <table> as siblings, so document order is enough to tell
    // which heading a paragraph or table belongs to.
    const section = { title: null, action: "deprecate", reason: null };
    for (const node of parsedDoc.querySelectorAll("h3, p, table.wikitable")) {
      switch (node.tagName) {
        case "H3": {
          section.title = node.textContent.replace(/\[edit\]/i, "").trim();
          section.action = section.title.toLowerCase().includes("urls to remove")
            ? "remove"
            : "deprecate";
          section.reason = null; // each section starts without a reason
          break;
        }
        case "P": {
          // Only the first item link after the heading counts as the
          // P2241 (reason for deprecated rank) value.
          if (!section.title || section.reason) break;
          const link = node.querySelector('a[href*="/wiki/Q"]');
          if (!link) break;
          const hit = link.getAttribute("href").match(/\/wiki\/(Q\d+)/);
          if (hit) section.reason = hit[1];
          break;
        }
        default: {
          // A table
          if (!section.title) break;
          for (const row of node.querySelectorAll("tr")) {
            const cells = row.querySelectorAll("td");
            if (cells.length < 2) continue; // header row
            const pattern = cells[0].textContent.trim();
            const matchType = cells[1].textContent.trim().toLowerCase();
            if (!pattern) continue;
            if (matchType !== "prefix" && matchType !== "regex") continue;
            const rule = {
              pattern,
              matchType,
              action: section.action,
              sectionLabel: section.title,
              deprecationReason: section.reason || null,
            };
            if (matchType === "regex") {
              try {
                rule.compiledRegex = new RegExp(pattern, "iu");
              } catch (err) {
                console.warn(
                  `${TOOL_NAME}: invalid blocklist regex "${pattern}": ${err.message}`,
                );
                continue; // skip broken regex rules
              }
            }
            collected.push(rule);
          }
        }
      }
    }
    console.log(`${TOOL_NAME}: fetched ${collected.length} URL blocklist rules`);
    return collected;
  } catch (err) {
    console.error(`${TOOL_NAME}: failed to fetch URL blocklist`, err);
    return [];
  }
}
/**
 * Fetch, for every external-id property, its P9073 (applicable "stated in")
 * values with their rank, plus related-entity QIDs (issuer, maintainer,
 * editor, etc.) that users sometimes wrongly use as "stated in" values in
 * references.
 *
 * Two SPARQL requests are issued in parallel:
 *   1. Non-deprecated P9073 values with rank -> builds preferred / allowed
 *   2. Related-entity fields on the property (P2378, P126, P10726, P1629) and
 *      the editor (P98) of each stated-in item -> any QID not in allowed goes
 *      into notAllowed
 *
 * @returns {Promise<Map<string, {preferred: (string|null), allowed: Set<string>, notAllowed: Set<string>}>>}
 *   Keyed by property id. A property is only included when it has a preferred
 *   value or at least one notAllowed QID. An empty Map is returned on failure.
 */
async function api_fetchPropertyStatedInPreferences() {
  const statedInQuery = `
SELECT ?prop ?stated_in ?rank WHERE {
?prop wikibase:propertyType wikibase:ExternalId;
p:P9073 ?stmt.
?stmt ps:P9073 ?stated_in;
wikibase:rank ?rank.
FILTER(?rank != wikibase:DeprecatedRank)
}
`;
  // P2378 issued by, P126 maintained by, P10726 class of, P1629 subject item,
  // P98 editor (on the stated-in item itself).
  const relatedQuery = `
SELECT DISTINCT ?prop ?related WHERE {
?prop wikibase:propertyType wikibase:ExternalId.
{
?prop wdt:P9073 ?stated_in.
{ ?stated_in wdt:P98 ?related. }
} UNION {
{ ?prop wdt:P2378 ?related. }
UNION { ?prop wdt:P126 ?related. }
UNION { ?prop wdt:P10726 ?related. }
UNION { ?prop wdt:P1629 ?related. }
}
}
`;
  const sparqlUrl = (sparql) =>
    "https://query.wikidata.org/sparql?query=" +
    encodeURIComponent(sparql) +
    "&format=json";
  try {
    const [statedInResp, relatedResp] = await Promise.all([
      fetch(sparqlUrl(statedInQuery)),
      fetch(sparqlUrl(relatedQuery)),
    ]);
    const statedInRows = (await statedInResp.json()).results.bindings;
    const relatedRows = (await relatedResp.json()).results.bindings;

    // Step 1: bucket the P9073 values of each property by rank.
    const ranked = new Map(); // propId -> { preferred: QID[], normal: QID[] }
    for (const row of statedInRows) {
      const propId = extractQID(row.prop.value);
      const statedInId = extractQID(row.stated_in.value);
      let buckets = ranked.get(propId);
      if (!buckets) {
        buckets = { preferred: [], normal: [] };
        ranked.set(propId, buckets);
      }
      const target = row.rank.value.endsWith("PreferredRank")
        ? buckets.preferred
        : buckets.normal;
      target.push(statedInId);
    }

    // Step 2: gather the related QIDs of each property.
    const related = new Map(); // propId -> Set<QID>
    for (const row of relatedRows) {
      const propId = extractQID(row.prop.value);
      const relatedId = extractQID(row.related.value);
      const known = related.get(propId);
      if (known) {
        known.add(relatedId);
      } else {
        related.set(propId, new Set([relatedId]));
      }
    }

    // Step 3: combine both into the final per-property record.
    //
    // With at least one preferred-rank value:
    //     preferred = first preferred-rank value
    //     allowed   = all preferred-rank + all normal-rank values
    // Otherwise:
    //     preferred = first normal-rank value (null when there is none)
    //     allowed   = all normal-rank values
    //
    // notAllowed holds the related QIDs (P2378/P126/P10726/P1629/P98) that are
    // not in allowed, i.e. items sometimes used as "stated in" although they
    // are not valid P9073 values for the property.
    const result = new Map();
    const propIds = new Set([...ranked.keys(), ...related.keys()]);
    propIds.forEach((propId) => {
      const { preferred: prefList, normal: normalList } = ranked.get(
        propId,
      ) || { preferred: [], normal: [] };
      const hasPref = prefList.length > 0;
      const preferred = hasPref ? prefList[0] : (normalList[0] ?? null);
      const allowed = new Set(
        hasPref ? [...prefList, ...normalList] : normalList,
      );
      const notAllowed = new Set(
        [...(related.get(propId) || [])].filter((qid) => !allowed.has(qid)),
      );
      if (preferred || notAllowed.size) {
        result.set(propId, {
          preferred: preferred ?? null,
          allowed,
          notAllowed,
        });
      }
    });
    return result;
  } catch (err) {
    console.error(
      `${TOOL_NAME}: failed to fetch property stated-in preferences`,
      err,
    );
    return new Map();
  }
}
/**
 * Fetch source-category rules from [[User:Difool/reference-source-categories]].
 *
 * Parses three wikitable sections identified by their === heading ===:
 *   "Aggregator sources"  -> category key "aggregator"
 *   "Community sources"   -> category key "community"
 *   "Redundant sources"   -> category key "redundant"
 * Tables under any other heading (or before the first heading) are ignored.
 *
 * Aggregator / community tables:
 *   Column 0: {{P|Pxx}} or bare Pxx  -> PID
 *   Column 1: Notes (ignored)
 *
 * Redundant table:
 *   Column 0: Weak property {{P|Pxx}}
 *   Column 1: Strong property {{P|Pxx}}
 *   Column 2: Strong stated-in QID {{Q|Qxx}} or bare Qxx (optional)
 *   Column 3: Notes (ignored)
 * Rows whose weak or strong property cannot be read are skipped.
 *
 * @returns {Promise<{aggregator: Set<string>, community: Set<string>,
 *   redundant: Array<{weakPid: string, strongPid: string, strongQid: (string|null)}>}>}
 *   `strongQid` is null when column 2 is absent or holds no QID. All three
 *   collections are empty when the page has no HTML or fetching fails.
 */
async function api_fetchSourceCategoryRules() {
  const empty = {
    aggregator: new Set(),
    community: new Set(),
    redundant: [],
  };
  // Read a PID from a table cell. Handles {{P|Pxx}} rendered as a link to
  // "Property:Pxx" as well as plain text like "P214".
  const extractPid = (cell) => {
    const link = cell.querySelector('a[href*="Property:P"]');
    if (link) {
      const m = link.getAttribute("href").match(/Property:(P\d+)/);
      if (m) return m[1];
      const title = link.getAttribute("title");
      if (title?.startsWith("Property:")) {
        return title.replace("Property:", "");
      }
    }
    const text = cell.textContent.trim();
    return /^P\d+$/.test(text) ? text : null;
  };
  // Read a QID from a table cell. Handles {{Q|Qxx}} rendered as an item link
  // as well as plain text like "Q42"; returns null when neither is present.
  const extractQid = (cell) => {
    if (!cell) return null;
    const link = cell.querySelector('a[href*="/wiki/Q"]');
    const m = link?.getAttribute("href")?.match(/\/wiki\/(Q\d+)/);
    if (m) return m[1];
    const text = cell.textContent.trim();
    return /^Q\d+$/.test(text) ? text : null;
  };
  try {
    const data = await new mw.Api().get({
      action: "parse",
      page: SOURCE_CATEGORIES_PAGE,
      prop: "text",
      format: "json",
    });
    const html = data?.parse?.text?.["*"];
    if (!html) return empty;
    const doc = new DOMParser().parseFromString(html, "text/html");
    const result = {
      aggregator: new Set(),
      community: new Set(),
      redundant: [],
    };
    let currentSection = null;
    for (const el of doc.querySelectorAll("h3, table.wikitable")) {
      if (el.tagName === "H3") {
        const heading = el.textContent
          .replace(/\[edit\]/i, "")
          .trim()
          .toLowerCase();
        if (heading.includes("aggregator")) currentSection = "aggregator";
        else if (heading.includes("community")) currentSection = "community";
        else if (heading.includes("redundant")) currentSection = "redundant";
        else currentSection = null;
        continue;
      }
      // It's a table
      if (!currentSection) continue;
      for (const tr of el.querySelectorAll("tr")) {
        const cells = tr.querySelectorAll("td");
        if (!cells.length) continue; // header row
        if (currentSection === "redundant") {
          if (cells.length < 2) continue;
          const weakPid = extractPid(cells[0]);
          const strongPid = extractPid(cells[1]);
          if (!weakPid || !strongPid) continue;
          // Column 2 is optional; keep the documented shape with an explicit null.
          const strongQid = cells.length >= 3 ? extractQid(cells[2]) : null;
          result.redundant.push({ weakPid, strongPid, strongQid });
        } else {
          // "aggregator" or "community": a single PID per row
          const pid = extractPid(cells[0]);
          if (pid) result[currentSection].add(pid);
        }
      }
    }
    console.log(
      `${TOOL_NAME}: fetched source category rules —`,
      `${result.aggregator.size} aggregator,`,
      `${result.community.size} community,`,
      `${result.redundant.length} redundant`,
    );
    return result;
  } catch (e) {
    console.error(`${TOOL_NAME}: failed to fetch source category rules`, e);
    return empty;
  }
}
// ==== 09 Cache registry ===================================================
// Registry of all caches, processed in array order by initCaches().
//
// Entry fields as used in this file:
//   key            - storage key; also used as the mw.notify tag and as the
//                    fetch-failure backoff key in refreshCacheWithNotify()
//   label          - human-readable name used in notifications and logs
//   type           - "set" or "map": how refreshCacheWithNotify() wraps the
//                    fetched data for entries without custom persistence
//   fetchFn        - async loader; initCaches() only refreshes entries that
//                    have one (null = never fetched from here)
//   assignFn       - copies fetched/loaded data into the in-memory cache
//   indexeddb_name - when present the entry is persisted through IndexedDB and
//                    is skipped unless settings.enableLargeBuffers is on
//   _customLoad / _customSave / _customReset
//                  - optional hooks that replace the generic load/save/reset
//                    paths; the ones defined here use localStorage with a
//                    { timestamp, ... } JSON blob and honour the entry's TTL
//   _customStatus  - optional hook returning a status sentence
//                    NOTE(review): its caller is outside this section.
//   ttl, target, valueKind
//                  - NOTE(review): not read in this section; presumably
//                    consumed by the generic cache_load*/cache_save* helpers.
const caches = [
// Occupations: no fetchFn, so this cache is never refreshed by initCaches().
{
key: OCC_CACHE_KEY,
label: "Occupations",
ttl: OCC_CACHE_TTL_MS,
target: occupationParentsCache,
type: "map",
valueKind: "set",
fetchFn: null,
assignFn(val) {
occupationParentsCache.clear();
for (const [qid, parents] of val)
occupationParentsCache.set(qid, parents);
},
},
// Wikipedia editions: QID -> language code.
{
key: WIKIPEDIA_EDITIONS_CACHE_KEY,
label: "Wikipedia editions",
ttl: WIKIPEDIA_EDITIONS_TTL_MS,
target: wikipediaEditionsCache,
type: "map",
valueKind: "plain",
fetchFn: api_fetchWikipediaEditions,
assignFn(val) {
wikipediaEditionsCache.clear();
for (const [qid, code] of val) wikipediaEditionsCache.set(qid, code);
},
},
// Language names: language code -> display name, with custom localStorage
// persistence because the fetched data is a plain object, not a Map/Set.
{
key: LANG_NAMES_CACHE_KEY,
label: "Language names",
ttl: LANG_NAMES_TTL_MS,
target: null,
type: "map",
valueKind: "plain",
fetchFn: api_fetchContentLanguages,
assignFn(val) {
wikipediaLangNamesCache.clear();
// val is a plain object { code: { name } } (fresh fetch or JSON round-trip)
for (const [code, entry] of Object.entries(val || {})) {
if (entry?.name) wikipediaLangNamesCache.set(code, entry.name);
}
},
_customSave() {
try {
localStorage.setItem(
LANG_NAMES_CACHE_KEY,
JSON.stringify({
timestamp: Date.now(),
// Store as flat code->name object; wrap back to { name } on load
// so assignFn can handle both fresh-fetch and round-trip shapes.
langs: Object.fromEntries(
[...wikipediaLangNamesCache.entries()].map(([code, name]) => [
code,
{ name },
]),
),
}),
);
} catch (e) {
console.warn(`${TOOL_NAME}: failed to save language names`, e);
}
},
// Returns true when a stored, unexpired blob was loaded into memory.
_customLoad() {
try {
const raw = localStorage.getItem(LANG_NAMES_CACHE_KEY);
if (!raw) return false;
const parsed = JSON.parse(raw);
if (!parsed?.langs) return false;
// Expired blobs are deleted so the caller refetches.
if (
parsed.timestamp &&
Date.now() - parsed.timestamp > LANG_NAMES_TTL_MS
) {
localStorage.removeItem(LANG_NAMES_CACHE_KEY);
return false;
}
wikipediaLangNamesCache.clear();
for (const [code, entry] of Object.entries(parsed.langs)) {
if (entry?.name) wikipediaLangNamesCache.set(code, entry.name);
}
return true;
} catch (e) {
console.warn(`${TOOL_NAME}: failed to load language names`, e);
return false;
}
},
_customReset() {
wikipediaLangNamesCache.clear();
localStorage.removeItem(LANG_NAMES_CACHE_KEY);
},
_customStatus() {
const raw = localStorage.getItem(LANG_NAMES_CACHE_KEY);
if (!raw) return "Language names cache is empty";
try {
const parsed = JSON.parse(raw);
const expiry = parsed.timestamp
? new Date(
parsed.timestamp + LANG_NAMES_TTL_MS,
).toLocaleDateString()
: "unknown";
return `Language names cache contains ${wikipediaLangNamesCache.size} entries, will be reset on ${expiry}`;
} catch {
return "Language names cache status unavailable";
}
},
},
// Obsolete ID properties: Set of property ids.
{
key: OBSOLETE_IDS_CACHE_KEY,
label: "Obsolete ID properties",
ttl: OBSOLETE_IDS_TTL_MS,
target: obsoleteIdProps,
type: "set",
fetchFn: api_fetchAllObsoleteIdProps,
assignFn(val) {
obsoleteIdProps.clear();
for (const item of val) obsoleteIdProps.add(item);
},
},
// Property regex constraints: property id -> regex (IndexedDB-backed).
// Regexes rejected by validatePropertyRegex() are logged and left out.
{
key: REGEX_CACHE_KEY,
label: "Property regex constraints",
ttl: REGEX_CACHE_TTL_MS,
target: propertyRegexCache,
type: "map",
valueKind: "plain",
fetchFn: api_fetchPropertyRegexConstraints,
indexeddb_name: "propertyRegex",
assignFn(val) {
propertyRegexCache.clear();
let skipped = 0;
for (const [prop, regex] of val) {
const { valid, error } = validatePropertyRegex(regex);
if (valid) {
propertyRegexCache.set(prop, regex);
} else {
console.warn(
`${TOOL_NAME}: invalid regex for ${prop}: "${regex}" — ${error}`,
);
skipped++;
}
}
if (skipped)
console.log(
`${TOOL_NAME}: skipped ${skipped} invalid property regex constraints`,
);
},
},
// Property URL match patterns: property id -> pattern objects
// (IndexedDB-backed). Patterns flagged by shouldIgnorePattern() or rejected
// by validateUrlPattern() are logged and left out; a property with no
// remaining pattern is not stored at all.
{
key: URL_PATTERNS_CACHE_KEY,
label: "Property URL match patterns",
ttl: URL_PATTERNS_TTL_MS,
target: propertyUrlPatternsCache,
type: "map",
valueKind: "array",
fetchFn: api_fetchPropertyUrlMatchPatterns,
indexeddb_name: "propertyUrlPatterns",
assignFn(val) {
propertyUrlPatternsCache.clear();
let skipped = 0;
for (const [prop, patternObjs] of val) {
const valid = [];
for (const pObj of patternObjs) {
// Accepts both { pattern, replacement } objects and bare pattern values.
const expr = pObj.pattern || pObj;
if (shouldIgnorePattern(prop, expr)) {
console.warn(
`${TOOL_NAME}: ignored pattern for ${prop}: "${expr}"`,
);
skipped++;
continue;
}
const result = validateUrlPattern(expr);
if (result.valid) {
valid.push(pObj);
} else {
console.warn(
`${TOOL_NAME}: invalid URL pattern for ${prop}: "${expr}" — ${result.error}`,
);
skipped++;
}
}
if (valid.length) propertyUrlPatternsCache.set(prop, valid);
}
if (skipped)
console.log(
`${TOOL_NAME}: skipped ${skipped} invalid/ignored URL patterns`,
);
},
},
// URL strip rules: { always, recognition } hostname -> parameter-name maps.
{
key: URL_STRIP_CACHE_KEY,
label: "URL strip rules",
ttl: URL_STRIP_CACHE_TTL_MS,
target: null,
type: "map",
valueKind: "plain",
fetchFn: api_fetchUrlStripRules,
assignFn(val) {
// After a fresh fetch val is the raw { always, recognition } object.
// After a localStorage round-trip it arrives as-is from JSON.parse.
const rules = val instanceof Map ? val.get("rules") : val;
if (!rules || typeof rules !== "object") return;
urlStripCache.always = rules.always || {};
urlStripCache.recognition = rules.recognition || {};
},
// Custom persistence: store as a single JSON blob (not a Map/Set).
_customSave() {
try {
localStorage.setItem(
URL_STRIP_CACHE_KEY,
JSON.stringify({
timestamp: Date.now(),
rules: {
always: urlStripCache.always,
recognition: urlStripCache.recognition,
},
}),
);
} catch (e) {
console.warn(`${TOOL_NAME}: failed to save URL strip rules`, e);
}
},
// Returns true when a stored, unexpired blob was loaded into memory.
_customLoad() {
try {
const raw = localStorage.getItem(URL_STRIP_CACHE_KEY);
if (!raw) return false;
const parsed = JSON.parse(raw);
if (!parsed?.rules) return false;
// Expired blobs are deleted so the caller refetches.
if (
parsed.timestamp &&
Date.now() - parsed.timestamp > URL_STRIP_CACHE_TTL_MS
) {
localStorage.removeItem(URL_STRIP_CACHE_KEY);
return false;
}
urlStripCache.always = parsed.rules.always || {};
urlStripCache.recognition = parsed.rules.recognition || {};
return true;
} catch (e) {
console.warn(`${TOOL_NAME}: failed to load URL strip rules`, e);
return false;
}
},
_customReset() {
urlStripCache.always = {};
urlStripCache.recognition = {};
localStorage.removeItem(URL_STRIP_CACHE_KEY);
},
_customStatus() {
const raw = localStorage.getItem(URL_STRIP_CACHE_KEY);
if (!raw) return "URL strip rules cache is empty";
try {
const parsed = JSON.parse(raw);
const expiry = parsed.timestamp
? new Date(
parsed.timestamp + URL_STRIP_CACHE_TTL_MS,
).toLocaleDateString()
: "unknown";
// Entry count = number of hostnames across both modes.
const n =
Object.keys(urlStripCache.always).length +
Object.keys(urlStripCache.recognition).length;
const wildcardNote =
urlStripCache.always["*"] || urlStripCache.recognition["*"]
? " (incl. global wildcard)"
: "";
return `URL strip rules cache contains ${n} entries${wildcardNote}, will be reset on ${expiry}`;
} catch {
return "URL strip rules cache status unavailable";
}
},
},
// Property 'stated in' preferences: property id ->
// { preferred, allowed: Set, notAllowed: Set } (IndexedDB-backed).
{
key: STATED_IN_CACHE_KEY,
label: "Property 'stated in' preferences",
ttl: STATED_IN_TTL_MS,
target: propertyStatedInCache,
type: "map",
valueKind: "plain",
fetchFn: api_fetchPropertyStatedInPreferences,
indexeddb_name: "propertyStatedIn",
assignFn(val) {
propertyStatedInCache.clear();
for (const [prop, data] of val) {
// After JSON round-trip Sets are serialised as arrays; restore them.
const allowed =
data.allowed instanceof Set
? data.allowed
: new Set(data.allowed || []);
const notAllowed =
data.notAllowed instanceof Set
? data.notAllowed
: new Set(data.notAllowed || []);
propertyStatedInCache.set(prop, {
preferred: data.preferred,
allowed,
notAllowed,
});
}
},
},
// URL deprecation blocklist: array of rule objects kept in
// urlBlocklistCache.rules. compiledRegex is rebuilt from `pattern` whenever
// rules come back from storage.
{
key: URL_BLOCKLIST_CACHE_KEY,
label: "URL deprecation blocklist",
ttl: URL_BLOCKLIST_CACHE_TTL_MS,
target: null,
type: "map", // placeholder — custom persistence used
valueKind: "plain",
fetchFn: api_fetchUrlBlocklist,
assignFn(val) {
// val may be a raw array (fresh fetch) or the parsed JSON (localStorage round-trip)
const rules = Array.isArray(val)
? val
: val instanceof Map
? val.get("rules")
: null;
if (!Array.isArray(rules)) return;
// Re-compile regex rules after JSON round-trip (RegExp is not JSON-serialisable)
urlBlocklistCache.rules = rules.map((r) => {
if (r.matchType === "regex" && !r.compiledRegex) {
try {
r.compiledRegex = new RegExp(r.pattern, "iu");
} catch {}
}
return r;
});
},
_customSave() {
try {
// Serialise without compiledRegex (not JSON-safe)
const serialisable = urlBlocklistCache.rules.map(
({ compiledRegex: _cr, ...rest }) => rest,
);
localStorage.setItem(
URL_BLOCKLIST_CACHE_KEY,
JSON.stringify({ timestamp: Date.now(), rules: serialisable }),
);
} catch (e) {
console.warn(`${TOOL_NAME}: failed to save URL blocklist`, e);
}
},
// Returns true when a stored, unexpired blob was loaded into memory.
_customLoad() {
try {
const raw = localStorage.getItem(URL_BLOCKLIST_CACHE_KEY);
if (!raw) return false;
const parsed = JSON.parse(raw);
if (!Array.isArray(parsed?.rules)) return false;
// Expired blobs are deleted so the caller refetches.
if (
parsed.timestamp &&
Date.now() - parsed.timestamp > URL_BLOCKLIST_CACHE_TTL_MS
) {
localStorage.removeItem(URL_BLOCKLIST_CACHE_KEY);
return false;
}
// A regex that no longer compiles leaves the rule without compiledRegex.
urlBlocklistCache.rules = parsed.rules.map((r) => {
if (r.matchType === "regex") {
try {
r.compiledRegex = new RegExp(r.pattern, "iu");
} catch {}
}
return r;
});
return true;
} catch (e) {
console.warn(`${TOOL_NAME}: failed to load URL blocklist`, e);
return false;
}
},
_customReset() {
urlBlocklistCache.rules = [];
localStorage.removeItem(URL_BLOCKLIST_CACHE_KEY);
},
_customStatus() {
const raw = localStorage.getItem(URL_BLOCKLIST_CACHE_KEY);
if (!raw) return "URL blocklist cache is empty";
try {
const parsed = JSON.parse(raw);
const expiry = parsed.timestamp
? new Date(
parsed.timestamp + URL_BLOCKLIST_CACHE_TTL_MS,
).toLocaleDateString()
: "unknown";
return `URL blocklist cache contains ${urlBlocklistCache.rules.length} rules, will be reset on ${expiry}`;
} catch {
return "URL blocklist cache status unavailable";
}
},
},
// Source category rules: aggregator/community PID sets plus the list of
// redundant-source rules, kept in sourceCategoryCache.
{
key: SOURCE_CATEGORIES_CACHE_KEY,
label: "Source category rules",
ttl: SOURCE_CATEGORIES_TTL_MS,
target: null,
type: "map",
valueKind: "plain",
fetchFn: api_fetchSourceCategoryRules,
assignFn(val) {
sourceCategoryCache.aggregator.clear();
sourceCategoryCache.community.clear();
sourceCategoryCache.redundant = [];
// val arrives as a plain object after JSON round-trip
const src = val instanceof Map ? Object.fromEntries(val) : val;
for (const pid of src?.aggregator || [])
sourceCategoryCache.aggregator.add(pid);
for (const pid of src?.community || [])
sourceCategoryCache.community.add(pid);
sourceCategoryCache.redundant = Array.isArray(src?.redundant)
? src.redundant.slice()
: [];
},
_customSave() {
try {
// Sets are not JSON-serialisable, so they are stored as arrays.
localStorage.setItem(
SOURCE_CATEGORIES_CACHE_KEY,
JSON.stringify({
timestamp: Date.now(),
aggregator: Array.from(sourceCategoryCache.aggregator),
community: Array.from(sourceCategoryCache.community),
redundant: sourceCategoryCache.redundant,
}),
);
} catch (e) {
console.warn(`${TOOL_NAME}: failed to save source category rules`, e);
}
},
// Returns true when a stored, unexpired blob was loaded into memory.
_customLoad() {
try {
const raw = localStorage.getItem(SOURCE_CATEGORIES_CACHE_KEY);
if (!raw) return false;
const parsed = JSON.parse(raw);
if (!parsed?.aggregator) return false;
// Expired blobs are deleted so the caller refetches.
if (
parsed.timestamp &&
Date.now() - parsed.timestamp > SOURCE_CATEGORIES_TTL_MS
) {
localStorage.removeItem(SOURCE_CATEGORIES_CACHE_KEY);
return false;
}
// Unlike assignFn/_customReset, this replaces the Set objects rather than
// clearing them in place.
sourceCategoryCache.aggregator = new Set(parsed.aggregator || []);
sourceCategoryCache.community = new Set(parsed.community || []);
sourceCategoryCache.redundant = parsed.redundant || [];
return true;
} catch (e) {
console.warn(`${TOOL_NAME}: failed to load source category rules`, e);
return false;
}
},
_customReset() {
sourceCategoryCache.aggregator.clear();
sourceCategoryCache.community.clear();
sourceCategoryCache.redundant = [];
localStorage.removeItem(SOURCE_CATEGORIES_CACHE_KEY);
},
_customStatus() {
const raw = localStorage.getItem(SOURCE_CATEGORIES_CACHE_KEY);
if (!raw) return "Source category rules cache is empty";
try {
const parsed = JSON.parse(raw);
const expiry = parsed.timestamp
? new Date(
parsed.timestamp + SOURCE_CATEGORIES_TTL_MS,
).toLocaleDateString()
: "unknown";
return (
`Source category rules cache: ` +
`${sourceCategoryCache.aggregator.size} aggregator, ` +
`${sourceCategoryCache.community.size} community, ` +
`${sourceCategoryCache.redundant.length} redundant — ` +
`will be reset on ${expiry}`
);
} catch {
return "Source category rules cache status unavailable";
}
},
},
];
// ==== 10 Fetch-failure backoff =============================================
/**
 * Read the persisted map of cache key -> time of the last failed fetch,
 * keeping only entries younger than FETCH_BACKOFF_MS.
 *
 * @returns {Object<string, number>} Surviving entries; {} when nothing is
 *   stored or the stored value cannot be read or parsed.
 */
function loadFetchFailureTimes() {
  try {
    const raw = localStorage.getItem(FETCH_FAILURE_TIMES_KEY);
    if (!raw) return {};
    const recorded = Object.entries(JSON.parse(raw));
    const now = Date.now();
    // Entries whose backoff window has already elapsed are dropped.
    const stillBackingOff = ([, failedAt]) => now - failedAt < FETCH_BACKOFF_MS;
    return Object.fromEntries(recorded.filter(stillBackingOff));
  } catch {
    return {};
  }
}
/**
 * Persist the fetch-failure timestamps to localStorage as JSON. Best effort:
 * a failure to serialise or store is logged as a warning and not rethrown.
 *
 * @param {Object<string, number>} times - Cache key -> failure timestamp.
 */
function saveFetchFailureTimes(times) {
  try {
    const payload = JSON.stringify(times);
    localStorage.setItem(FETCH_FAILURE_TIMES_KEY, payload);
  } catch (err) {
    console.warn(`${TOOL_NAME}: failed to save fetch failure times`, err);
  }
}
// In-memory map of cache key -> timestamp (ms) of the last failed fetch,
// seeded once from localStorage with expired entries already pruned.
// refreshCacheWithNotify() reads and updates it to enforce the backoff.
const fetchFailureTimes = loadFetchFailureTimes();
/**
 * Count the entries a custom-persisted cache holds after its assignFn ran.
 *
 * The known custom caches are measured on their in-memory stores; any other
 * entry falls back to the size of the fetched data itself.
 *
 * @param {object} entry - Cache registry entry (only `key` is read).
 * @param {*} data - The value returned by the entry's fetchFn.
 * @returns {number} Number of entries now held for this cache.
 */
function cache_countCustomEntries(entry, data) {
  switch (entry.key) {
    case URL_STRIP_CACHE_KEY:
      return (
        Object.keys(urlStripCache.always).length +
        Object.keys(urlStripCache.recognition).length
      );
    case LANG_NAMES_CACHE_KEY:
      return wikipediaLangNamesCache.size;
    case URL_BLOCKLIST_CACHE_KEY:
      return urlBlocklistCache.rules.length;
    case SOURCE_CATEGORIES_CACHE_KEY:
      return (
        sourceCategoryCache.aggregator.size +
        sourceCategoryCache.community.size +
        sourceCategoryCache.redundant.length
      );
    default:
      if (data == null) return 0;
      if (Array.isArray(data)) return data.length;
      if (typeof data.size === "number") return data.size;
      return typeof data === "object" ? Object.keys(data).length : 0;
  }
}
/**
 * Refetch one cache, update the in-memory copy, persist it and tell the user
 * how it went through mw.notify (all notifications share the entry key as tag).
 *
 * Skipped without fetching when:
 *   - the entry is IndexedDB-backed and large buffers are disabled, or
 *   - the previous attempt failed less than FETCH_BACKOFF_MS ago.
 *
 * An empty result counts as a failure: it is not persisted and it starts the
 * backoff, exactly like a thrown error. The fetch helpers report network
 * errors by returning an empty container, so persisting such a result would
 * pin an empty cache for the whole TTL.
 *
 * @param {object} entry - Cache registry entry.
 * @param {object} [settings] - User settings; only `enableLargeBuffers` is read.
 * @returns {Promise<void>}
 */
async function refreshCacheWithNotify(entry, settings) {
  const { key, label, type, fetchFn, assignFn } = entry;
  const tag = key;
  if (entry.indexeddb_name && !settings?.enableLargeBuffers) {
    console.warn(`${TOOL_NAME}: skipping ${label} — requires large buffers`);
    return;
  }
  const lastFail = fetchFailureTimes[key];
  if (lastFail) {
    const elapsed = Date.now() - lastFail;
    const remaining = FETCH_BACKOFF_MS - elapsed;
    if (remaining > 0) {
      const retryIn = Math.ceil(remaining / 60000);
      console.warn(
        `${TOOL_NAME}: skipping ${label} — retrying in ~${retryIn} min`,
      );
      mw.notify(
        `Skipping ${label}: will retry later (last attempt failed recently)`,
        {
          type: "warn",
          autoHide: true,
          tag,
        },
      );
      return;
    }
  }
  mw.notify(`Loading ${label}...`, { type: "info", autoHide: false, tag });
  try {
    const data = await fetchFn();
    const isCustom = Boolean(entry._customSave);
    // Custom entries return a plain object or array rather than an iterable
    // of pairs, so skip the Map/Set wrapper and pass the data through.
    const container = isCustom
      ? data
      : type === "set"
        ? new Set(data)
        : new Map(data);
    assignFn(container);
    // Measured after assignFn so custom caches are counted on their own store.
    const count = isCustom
      ? cache_countCustomEntries(entry, data)
      : container.size;
    if (!(count > 0)) {
      console.warn(`${TOOL_NAME}: ${label} returned an empty result`);
      fetchFailureTimes[key] = Date.now();
      saveFetchFailureTimes(fetchFailureTimes);
      mw.notify(`Failed to reload ${label}: empty result`, {
        type: "error",
        autoHide: true,
        tag,
      });
      return;
    }
    if (isCustom) {
      entry._customSave();
    } else if (entry.indexeddb_name) {
      await cache_saveIndexedDB(entry);
    } else {
      cache_saveLocalSt(entry);
    }
    delete fetchFailureTimes[key];
    saveFetchFailureTimes(fetchFailureTimes);
    mw.notify(`Reloaded ${label} (${count} entries)`, {
      type: "success",
      autoHide: true,
      tag,
    });
  } catch (err) {
    console.error(`${TOOL_NAME}: failed to reload ${label}`, err);
    fetchFailureTimes[key] = Date.now();
    saveFetchFailureTimes(fetchFailureTimes);
    mw.notify(`Failed to reload ${label}: ${err.message}`, {
      type: "error",
      autoHide: true,
      tag,
    });
  }
}
// ==== 11 Cache init =======================================================
/**
 * Bring every registered cache into memory at startup.
 *
 * Steps, in order:
 *   1. With large buffers disabled, reset each IndexedDB-backed cache
 *      (rejections of the reset promise are ignored).
 *   2. Initialise IndexedDB; on failure log a warning and clear the
 *      `indexedDBReady` flag.
 *   3. For each entry, in registry order: skip IndexedDB-backed entries when
 *      large buffers are disabled; otherwise load from persistence via the
 *      entry's `_customLoad` hook, IndexedDB or localStorage. Entries that
 *      did not load and have a `fetchFn` are refreshed, one at a time.
 *
 * @param {object} settings - User settings; `enableLargeBuffers` is read here
 *   and the whole object is forwarded to refreshCacheWithNotify().
 * @returns {Promise<void>}
 */
async function initCaches(settings) {
  // Step 1: wipe IndexedDB-backed caches that must not be kept.
  if (!settings.enableLargeBuffers) {
    for (const entry of caches) {
      if (!entry.indexeddb_name) continue;
      await cache_resetIndexedDB(entry).catch(() => {});
    }
  }

  // Step 2: open IndexedDB.
  try {
    await initIndexedDB();
  } catch {
    console.warn(
      `${TOOL_NAME}: IndexedDB unavailable, falling back to localStorage`,
    );
    indexedDBReady = false;
  }

  // Step 3: load each cache, refetching the ones that are missing or stale.
  for (const entry of caches) {
    const needsLargeBuffers = Boolean(entry.indexeddb_name);
    if (needsLargeBuffers && !settings.enableLargeBuffers) continue;

    let loaded = false;
    try {
      if (entry._customLoad) {
        loaded = entry._customLoad();
      } else if (entry.indexeddb_name) {
        loaded = await cache_loadIndexedDB(entry);
      } else {
        loaded = cache_loadLocalSt(entry);
      }
    } catch (err) {
      console.warn(`${TOOL_NAME}: failed to load ${entry.key}`, err);
    }

    if (loaded || !entry.fetchFn) continue;
    await refreshCacheWithNotify(entry, settings);
  }
}
// ==== 12 Claim helpers ====================================================
/**
 * Classify one reference of a statement into a source category.
 *
 * The checks run in a fixed order and the first match wins:
 *   "ignore"                 – only P813 / P1476 / P1810 snaks (metadata only)
 *   "external-id:error"      – an external-id snak could not be validated
 *                              because its format regex failed to compile
 *   "invalid"                – an external-id snak failed validation
 *   "self_stated_in"         – tautological P248 on an external-id statement
 *   "wikimedia_no_sitelinks" – Wikimedia import whose edition has no sitelink
 *                              (only when DEV_WIKIMEDIA_NO_SITELINKS is set)
 *   "wikimedia" / "wikimedia+" – Wikimedia import (plain / with P459 or P3452)
 *   "inferred+" / "inferred" – matched-by-identifier / lone P3452 or P887
 *   "aggregator", "community", "redundant", "obsolete"
 *   null                     – none of the above applied
 *
 * External-id validation and the obsolete check are skipped for references
 * that carry P813 (retrieved).
 *
 * @param {Object} entity – item the statement belongs to (forwarded to
 *   analyzeWikimediaReference).
 * @param {Object} ref – the reference being classified.
 * @param {Object[]} allRefs – all references of the same statement; used by
 *   the "redundant" check to look for a stronger sibling reference.
 * @param {Object|null} [claim=null] – the owning statement; only consulted
 *   for the "self_stated_in" check.
 * @returns {string|null} category name, or null.
 */
function determineSourceCategory(entity, ref, allRefs, claim = null) {
  const snaks = ref.snaks || {};
  const pids = Object.keys(snaks);

  // QIDs of all "stated in" (P248) values of a snak map; tolerates a missing
  // map and skips snaks without a numeric item id.
  const statedInQidsOf = (refSnaks) =>
    ((refSnaks || {})[PID_STATED_IN] || [])
      .map((c) => c.datavalue?.value?.["numeric-id"])
      .filter((n) => typeof n === "number")
      .map((n) => "Q" + n);

  // References that only carry metadata are not real sources
  if (
    pids.length > 0 &&
    pids.every((pid) =>
      [PID_RETRIEVED, PID_TITLE, PID_SUBJECT_NAMED_AS].includes(pid),
    )
  ) {
    return "ignore";
  }
  const hasRetrieved = pids.includes(PID_RETRIEVED);

  // Validate external IDs in non-retrieved references
  if (!hasRetrieved) {
    for (const pid of pids) {
      // Named `snak` so it does not shadow the `claim` parameter.
      for (const snak of snaks[pid] || []) {
        if (snak.datatype !== "external-id") continue;
        const validation = validateExternalIdValue(pid, snak.datavalue?.value);
        if (!validation.valid) {
          return validation.reason === "regex_error"
            ? "external-id:error"
            : "invalid";
        }
      }
    }
  }

  const statedInQIDs = statedInQidsOf(snaks);

  // Tautological "stated in": the reference consists solely of P248 whose
  // value is an allowed stated-in for the claim's own external-id property,
  // plus at most P1476 (title) / P1810 (subject named as) metadata — and
  // crucially no P813 (retrieved). Such a reference adds no information
  // beyond what the external-id property itself already implies.
  // Example: ISNI claim (P213) with a reference that only has
  // stated in (P248) = International Standard Name Identifier (Q423048).
  if (
    !hasRetrieved &&
    claim?.mainsnak?.datatype === "external-id" &&
    pids.includes(PID_STATED_IN) &&
    pids.every(
      (p) =>
        p === PID_STATED_IN || p === PID_TITLE || p === PID_SUBJECT_NAMED_AS,
    )
  ) {
    const claimPid = claim.mainsnak.property;
    const prefs = propertyStatedInCache.get(claimPid);
    if (
      prefs?.allowed?.size &&
      statedInQIDs.some((q) => prefs.allowed.has(q))
    ) {
      return "self_stated_in";
    }
  }

  const wikimediaAllowed = new Set([
    PID_INFERRED,
    PID_WIKIMEDIA_IMPORT_URL,
    PID_IMPORTED_FROM,
    PID_RETRIEVED,
  ]);
  // Wikimedia imports
  if (
    pids.includes(PID_IMPORTED_FROM) ||
    pids.includes(PID_WIKIMEDIA_IMPORT_URL)
  ) {
    if (
      DEV_WIKIMEDIA_NO_SITELINKS &&
      analyzeWikimediaReference(entity, ref).hasMissingSitelink
    )
      return "wikimedia_no_sitelinks";
    if (
      pids.includes(PID_INFERRED) &&
      pids.every((p) => wikimediaAllowed.has(p))
    )
      return "wikimedia";
    if (
      pids.includes(PID_DETERMINATION_METHOD) ||
      pids.includes(PID_INFERRED)
    )
      return "wikimedia+";
    return "wikimedia";
  }

  if (pids.includes(PID_MATCHED_BY_IDENTIFIER_FROM)) return "inferred+";
  if (
    (pids.includes(PID_INFERRED) || pids.includes(PID_BASED_ON_HEURISTIC)) &&
    pids.length === 1
  ) {
    return "inferred";
  }

  // Aggregator check: direct snak PID match, or P248 stated-in QID match
  // (QIDs derived at runtime from propertyStatedInCache via the aggregator PID).
  if (sourceCategoryCache.aggregator.size) {
    for (const aggPid of sourceCategoryCache.aggregator) {
      if (pids.includes(aggPid)) return "aggregator";
      const aggQids = propertyStatedInCache.get(aggPid)?.allowed;
      if (aggQids && statedInQIDs.some((q) => aggQids.has(q)))
        return "aggregator";
    }
  }

  // Community check: same pattern.
  if (sourceCategoryCache.community.size) {
    for (const comPid of sourceCategoryCache.community) {
      if (pids.includes(comPid)) return "community";
      const comQids = propertyStatedInCache.get(comPid)?.allowed;
      if (comQids && statedInQIDs.some((q) => comQids.has(q)))
        return "community";
    }
  }

  // Redundant check: weak PID/QID present in this ref, strong PID/QID present
  // in at least one *other* ref on the same statement.
  for (const { weakPid, strongPid } of sourceCategoryCache.redundant) {
    const weakQids = propertyStatedInCache.get(weakPid)?.allowed;
    const isWeak =
      pids.includes(weakPid) ||
      (weakQids && statedInQIDs.some((q) => weakQids.has(q)));
    if (!isWeak) continue;
    const strongQids = propertyStatedInCache.get(strongPid)?.allowed;
    const strongPresent = allRefs.some((r) => {
      if (r === ref) return false;
      const rSnaks = r.snaks || {};
      if (Object.keys(rSnaks).includes(strongPid)) return true;
      // A sibling reference without snaks must not throw here.
      return Boolean(
        strongQids && statedInQidsOf(rSnaks).some((q) => strongQids.has(q)),
      );
    });
    if (strongPresent) return "redundant";
  }

  // Obsolete external-id check (skip if there is a P813)
  if (!hasRetrieved) {
    const obsoletePids = pids.filter(
      (pid) =>
        obsoleteIdProps.has(pid) &&
        (snaks[pid] || []).some(
          (c) => c.datavalue?.value && c.datatype === "external-id",
        ),
    );
    if (obsoletePids.length) {
      // Only obsolete when no other (non-obsolete) external id backs the ref.
      const hasOtherExtId = pids.some(
        (pid) =>
          !obsoletePids.includes(pid) &&
          (snaks[pid] || []).some((c) => c.datatype === "external-id"),
      );
      if (!hasOtherExtId) return "obsolete";
    }
  }
  return null;
}
/**
 * Tells whether a statement carries a "real" value, i.e. its mainsnak
 * datatype is anything other than external-id. Such statements warrant
 * stricter sourcing checks.
 * @param {Object} claim – statement to inspect
 * @returns {boolean}
 */
function isStrictStatement(claim) {
  const datatype = claim.mainsnak?.datatype;
  return datatype !== "external-id";
}
/**
 * Map a reference to a numeric quality level, based on the category that
 * determineSourceCategory assigns to it:
 *   0 – wikimedia import (with or without sitelink), metadata-only
 *       ("ignore"), or tautological stated-in ("self_stated_in")
 *   1 – aggregator, community, inferred, invalid, obsolete, or redundant
 *   2 – every other category, including null (treated as a genuine source)
 *
 * @param {Object} entity
 * @param {Object} ref
 * @param {Object[]} allRefs
 * @param {Object|null} [claim=null]
 * @returns {0|1|2}
 */
function getReferenceLevel(entity, ref, allRefs, claim = null) {
  const category = determineSourceCategory(entity, ref, allRefs, claim);
  const levelZero = [
    "wikimedia_no_sitelinks",
    "wikimedia",
    "ignore",
    "self_stated_in",
  ];
  const levelOne = [
    "aggregator",
    "community",
    "inferred",
    "invalid",
    "obsolete",
    "redundant",
  ];
  if (levelZero.includes(category)) return 0;
  if (levelOne.includes(category)) return 1;
  return 2;
}
/**
 * A reference is weak when its quality level (see getReferenceLevel) is
 * below 2.
 * @returns {boolean}
 */
function isWeakReference(entity, ref, allRefs, claim = null) {
  const level = getReferenceLevel(entity, ref, allRefs, claim);
  return level < 2;
}
/**
 * True when at least one reference of the claim reaches quality level 2
 * (see getReferenceLevel). A claim without references is never valid.
 * @param {Object} entity
 * @param {Object} claim
 * @returns {boolean}
 */
function hasValidReference(entity, claim) {
  const refs = claim.references || [];
  for (const ref of refs) {
    if (getReferenceLevel(entity, ref, refs, claim) === 2) return true;
  }
  return false;
}
/**
 * Collect the item ids of all P279 (subclass of) values of an entity that
 * pass isQid.
 * @param {Object|null|undefined} entity
 * @returns {Set<string>} possibly empty
 */
function extractP279Parents(entity) {
  const subclassClaims = entity?.claims?.[PID_SUBCLASS_OF] || [];
  const parentIds = subclassClaims
    .map((claim) => claim?.mainsnak?.datavalue?.value?.id)
    .filter((id) => isQid(id));
  return new Set(parentIds);
}
/**
 * List the property ids of an entity for which at least one claim has a
 * mainsnak of datatype "time".
 * @param {Object} entity
 * @returns {string[]} property ids in the entity's own key order
 */
function getDateProperties(entity) {
  const claimsByPid = entity.claims || {};
  const isTimeClaim = (claim) => claim?.mainsnak?.datatype === "time";
  return Object.keys(claimsByPid).filter((pid) =>
    claimsByPid[pid].some(isTimeClaim),
  );
}
/**
 * Normalize a WbTime value object to the given precision level so that two
 * values can be compared by their `time` strings.
 *
 * Year handling by precision:
 *   <= 5  rounded to the nearest multiple of 10^(9 - precision)
 *   6 / 7 rounded away from zero to the millennium / century boundary
 *   8     truncated to the decade
 *   >= 9  year kept as is
 * A day-precision value whose month or day is "00", or a month-precision
 * value whose month is "00", is downgraded to year precision (9).
 *
 * The sign, year, month and day are parsed with one regex. Splitting the
 * timestamp on "-" would mis-index the fields of negative (BCE) years, whose
 * leading "-" produces an empty first part.
 *
 * @param {{time: string, calendarmodel?: string}|null|undefined} val
 * @param {number} precision – Wikibase precision to normalize to
 * @returns {{time: string, precision: number, calendarmodel: *}|null}
 *   `time` is "±Y-MM-DD" for precision 11, "±Y-MM" for precision 10 and
 *   "±Y-01-01T00:00:00Z" otherwise; null when `val` is missing or its time
 *   string cannot be parsed.
 */
function normalizeDateValue(val, precision) {
  if (!val || typeof val.time !== "string") return null;
  const parsed = /^([+-]\d+)-(?:(\d{2})-(\d{2}))?/.exec(val.time);
  if (!parsed) return null;
  const year = parseInt(parsed[1], 10);
  // Missing month/day parts are treated like "00" (unknown).
  const month = parsed[2] ?? "00";
  const day = parsed[3] ?? "00";

  const fmtYear = (y) => (y >= 0 ? `+${y}` : `${y}`);

  let ny = year;
  if (precision <= 5) {
    const pow = Math.pow(10, 9 - precision);
    ny = Math.round(year / pow) * pow;
  } else if (precision === 6) {
    const yf = year / 1000;
    ny = (yf < 0 ? Math.floor(yf) : Math.ceil(yf)) * 1000;
  } else if (precision === 7) {
    const yf = year / 100;
    ny = (yf < 0 ? Math.floor(yf) : Math.ceil(yf)) * 100;
  } else if (precision === 8) {
    ny = Math.trunc(year / 10) * 10;
  }
  // precision 9 and finer: ny = year unchanged

  // Downgrade precision when month/day are "00"
  let effective = precision;
  if (effective === 11 && (month === "00" || day === "00")) effective = 9;
  if (effective === 10 && month === "00") effective = 9;

  let time;
  if (effective === 11) time = `${fmtYear(ny)}-${month}-${day}`;
  else if (effective === 10) time = `${fmtYear(ny)}-${month}`;
  else time = `${fmtYear(ny)}-01-01T00:00:00Z`;

  return { time, precision: effective, calendarmodel: val.calendarmodel };
}
/**
 * Compare the date values of two claims after normalization.
 *
 * With `at_lowest_precision` both values are normalized to the lower of the
 * two precisions, e.g. "1955" == "2 Mar 1955", "1590s" == "1591". In that
 * mode the calendar model is not compared when the precisions differ or the
 * common precision is year (9) or coarser. Without it each value is
 * normalized at its own precision.
 *
 * @param {Object} c1 – claim (only `mainsnak.datavalue.value` is read)
 * @param {Object} c2 – claim (only `mainsnak.datavalue.value` is read)
 * @param {boolean} at_lowest_precision
 * @param {boolean} ignore_calendarmodel – never compare calendar models
 * @returns {boolean} true when both claims have no value, or both values
 *   normalize to the same time and precision (and calendar model where it is
 *   compared); false when only one has a value or a value cannot be
 *   normalized.
 */
function has_same_normalized_date(
  c1,
  c2,
  at_lowest_precision,
  ignore_calendarmodel,
) {
  const v1 = c1.mainsnak?.datavalue?.value;
  const v2 = c2.mainsnak?.datavalue?.value;
  if (!v1 && !v2) return true;
  if (!v1 || !v2) return false;

  let precision1 = v1.precision;
  let precision2 = v2.precision;
  let calendarIrrelevant = false;
  if (at_lowest_precision) {
    const lowPrec = Math.min(v1.precision, v2.precision);
    calendarIrrelevant =
      lowPrec <= 9 || v1.precision !== lowPrec || v2.precision !== lowPrec;
    precision1 = lowPrec;
    precision2 = lowPrec;
  }

  const n1 = normalizeDateValue(v1, precision1);
  const n2 = normalizeDateValue(v2, precision2);
  // An unparseable time cannot be shown to equal anything.
  if (!n1 || !n2) return false;

  return (
    n1.time === n2.time &&
    n1.precision === n2.precision &&
    (ignore_calendarmodel ||
      calendarIrrelevant ||
      n1.calendarmodel === n2.calendarmodel)
  );
}
// The one qualifier snak that is tolerated (ignored) when comparing the
// qualifiers of two claims: P31 with this specific item as value.
const TOLERANT_QUALIFIER = { pid: "P31", value: "Q26961029" };
/**
 * Compare the qualifiers of two claims, ignoring only the tolerated
 * qualifier snak (TOLERANT_QUALIFIER: its pid with its value). P31 qualifiers
 * with any other value still count as a difference. The comparison does not
 * depend on the order in which the qualifier properties are stored.
 *
 * @param {Object} claimA
 * @param {Object} claimB
 * @returns {boolean} true when the remaining qualifiers serialize identically
 */
function qualifiersEqualExceptP31(claimA, claimB) {
  const comparableQualifiers = (claim) => {
    const source = claim.qualifiers || {};
    const result = {};
    // Sorted keys make the JSON comparison independent of insertion order.
    for (const pid of Object.keys(source).sort()) {
      let snaks = source[pid];
      if (pid === TOLERANT_QUALIFIER.pid && Array.isArray(snaks)) {
        snaks = snaks.filter(
          (snak) => snak?.datavalue?.value?.id !== TOLERANT_QUALIFIER.value,
        );
        // Nothing but the tolerated snak: treat the property as absent.
        if (!snaks.length) continue;
      }
      result[pid] = snaks;
    }
    return result;
  };
  return (
    JSON.stringify(comparableQualifiers(claimA)) ===
    JSON.stringify(comparableQualifiers(claimB))
  );
}
/**
 * Serialize a datavalue to a JSON string usable as a comparison key; a
 * missing (falsy) datavalue serializes like an empty object.
 * @param {*} datavalue
 * @returns {string}
 */
function normalizeDataValueKey(datavalue) {
  const subject = datavalue ? datavalue : {};
  return JSON.stringify(subject);
}
/**
 * True when the claim has no qualifier snaks at all: no qualifiers object,
 * no qualifier properties, or only properties with empty snak lists.
 * @param {Object} claim
 * @returns {boolean}
 */
function hasNoQualifiers(claim) {
  const snakLists = Object.values(claim.qualifiers || {});
  return snakLists.every((snakList) => !snakList?.length);
}
/**
 * Numeric ordering of statement ranks: preferred (2) > normal (1) >
 * deprecated (0); anything else is -1.
 * @param {string} rank
 * @returns {number}
 */
function rankOrder(rank) {
  switch (rank) {
    case "preferred":
      return 2;
    case "normal":
      return 1;
    case "deprecated":
      return 0;
    default:
      return -1;
  }
}
/**
 * Pick the claim to keep when merging duplicates.
 *
 * Candidate pool: normally the non-deprecated claims (all claims when every
 * one is deprecated). With `preferDeprecated` and no preferred claim in the
 * group, the deprecated claims form the pool instead (all claims when none
 * is deprecated).
 *
 * Within the pool the winner is decided by, in order: higher rank, more
 * references, more qualifier snaks, lexicographically smaller claim id.
 *
 * @param {Object[]} claims – non-empty list of duplicate claims
 * @param {Object} [opts]
 * @param {boolean} [opts.preferDeprecated=false] – when true and no preferred
 *   claim is present in the group, choose a deprecated claim as target rather
 *   than a normal one. Used when a normal duplicate is being folded into an
 *   intentionally-deprecated claim that carries P2241.
 * @returns {Object} the claim to keep
 */
function chooseMergeTarget(claims, { preferDeprecated = false } = {}) {
  const wantDeprecated = Boolean(
    preferDeprecated && !claims.some((c) => c.rank === "preferred"),
  );
  const favoured = claims.filter(
    (c) => (c.rank === "deprecated") === wantDeprecated,
  );
  const pool = favoured.length ? favoured : claims;

  const referenceCount = (c) => (c.references || []).length;
  const qualifierCount = (c) =>
    Object.values(c.qualifiers || {}).reduce(
      (total, snakList) => total + snakList.length,
      0,
    );

  // True when `challenger` should replace `holder` as the merge target.
  const beats = (challenger, holder) => {
    const challengerRank = rankOrder(challenger.rank);
    const holderRank = rankOrder(holder.rank);
    if (challengerRank !== holderRank) return challengerRank > holderRank;

    const challengerRefs = referenceCount(challenger);
    const holderRefs = referenceCount(holder);
    if (challengerRefs !== holderRefs) return challengerRefs > holderRefs;

    const challengerQuals = qualifierCount(challenger);
    const holderQuals = qualifierCount(holder);
    if (challengerQuals !== holderQuals) return challengerQuals > holderQuals;

    return (challenger.id || "") < (holder.id || "");
  };

  return pool.reduce((holder, challenger) =>
    beats(challenger, holder) ? challenger : holder,
  );
}
/**
 * Read the first P813 (retrieved) snak of a reference and convert its time
 * to a millisecond timestamp via parseWikibaseTime.
 * @param {Object} ref
 * @returns {number|null} null when the snak is absent, has no time, cannot
 *   be parsed, or anything throws along the way.
 */
function parseRetrievedTimestamp(ref) {
  try {
    const rawTime = ref.snaks?.[PID_RETRIEVED]?.[0]?.datavalue?.value?.time;
    if (!rawTime) return null;
    const parsed = parseWikibaseTime(rawTime);
    if (!parsed) return null;
    return parsed.getTime();
  } catch {
    return null;
  }
}
/**
 * True when `langcode` appears as a value (second element of an entry) in
 * wikipediaEditionsCache.
 * @param {string} langcode
 * @returns {boolean}
 */
function isValidWikipediaEdition(langcode) {
  const knownCodes = Array.from(wikipediaEditionsCache, ([, code]) => code);
  return knownCodes.some((code) => code === langcode);
}
/**
 * Inspects the P143 (imported from) and P4656 (Wikimedia import URL) snaks
 * of a reference and returns:
 *   { language: string|null, hasMissingSitelink: boolean }
 * `language` is the first language found (P143 snaks are read before P4656).
 * hasMissingSitelink is true when the Wikipedia edition implied by the
 * reference is not among the item's sitelinks.
 *
 * Sitelink keys are database names, which use "_" where the language code
 * uses "-" (e.g. "zh-min-nan" → "zh_min_nanwiki"), so both spellings are
 * looked up. Import URLs with hyphenated language subdomains are recognized
 * as well.
 *
 * @param {Object|null|undefined} entity – item; only `sitelinks` is read
 * @param {Object|null|undefined} ref – reference to inspect
 * @returns {{language: string|null, hasMissingSitelink: boolean}}
 */
function analyzeWikimediaReference(entity, ref) {
  if (!ref?.snaks) return { language: null, hasMissingSitelink: false };
  const sitelinks = entity?.sitelinks || {};
  let language = null;
  let hasMissingSitelink = false;

  // Looks a site up under its language-code and its database-name spelling.
  const hasSitelink = (lang, project) =>
    Boolean(
      sitelinks[lang + project] ||
        sitelinks[String(lang).replace(/-/g, "_") + project],
    );

  // P143 (imported from Wikimedia project)
  for (const snak of ref.snaks[PID_IMPORTED_FROM] || []) {
    const qid = snak.datavalue?.value?.id;
    if (qid && wikipediaEditionsCache.has(qid)) {
      const lang = wikipediaEditionsCache.get(qid);
      if (!language) language = lang;
      if (!hasSitelink(lang, "wiki")) hasMissingSitelink = true;
    }
  }

  // P4656 (Wikimedia import URL)
  for (const snak of ref.snaks[PID_WIKIMEDIA_IMPORT_URL] || []) {
    const url = snak.datavalue?.value;
    if (!url) continue;
    try {
      const { hostname } = new URL(url);
      // <lang>.<project>.<tld>, optionally prefixed with "www."
      const m = hostname
        .toLowerCase()
        .match(/^(?:www\.)?([a-z]+(?:-[a-z]+)*)\.([a-z]+(?:\.[a-z]+)?)$/);
      if (!m) continue;
      const [, lang, domain] = m;
      if (lang === "wikidata") continue;
      if (!language) language = lang;
      let project = domain.split(".")[0];
      // Wikipedia and wikimedia.org sites use the plain "wiki" suffix.
      if (project === "wikipedia" || project === "wikimedia")
        project = "wiki";
      if (!hasSitelink(lang, project)) {
        if (isValidWikipediaEdition(lang)) hasMissingSitelink = true;
        else
          console.warn(
            `${TOOL_NAME}: "${lang}" from "${url}" not in Wikipedia editions cache`,
          );
      }
    } catch {
      // Not a parseable URL: nothing to analyze for this snak.
      continue;
    }
  }
  return { language, hasMissingSitelink };
}
/**
 * Look a claim up by its id across all properties of an entity.
 * @param {Object|null|undefined} entity
 * @param {string} claimId
 * @returns {Object|null} the claim, or null when the entity has no claims,
 *   no id was given, or nothing matches.
 */
function findClaimById(entity, claimId) {
  if (!entity?.claims || !claimId) return null;
  for (const claimList of Object.values(entity.claims)) {
    const hit = claimList.find((claim) => claim.id === claimId);
    if (hit) return hit;
  }
  return null;
}
/**
 * Decides whether a claim may take part in value-based deduplication
 * (detectDuplicateValues), i.e. whether its value alone identifies
 * duplicates. That is the case when the claim
 *   - has no qualifiers at all, or
 *   - is an external-id claim whose only qualifier is P1810 (subject named
 *     as), or
 *   - is preferred and carries P7452 (reason for preferred rank), optionally
 *     with P1810 and nothing else, or
 *   - is deprecated and carries P2241 (reason for deprecated rank),
 *     optionally with P1810 and nothing else.
 * Qualifier properties with empty snak lists are disregarded.
 *
 * @param {Object} claim
 * @returns {boolean}
 */
function canBeGrouped(claim) {
  if (hasNoQualifiers(claim)) return true;

  const present = Object.keys(claim.qualifiers || {}).filter(
    (pid) => (claim.qualifiers[pid] || []).length > 0,
  );

  const onlyNamedAs =
    present.length === 1 && present[0] === PID_SUBJECT_NAMED_AS;
  if (claim.mainsnak?.datatype === "external-id" && onlyNamedAs) return true;

  // The rank-reason qualifier that matches the claim's rank, if any.
  let reasonPid = null;
  if (claim.rank === "preferred") reasonPid = PID_REASON_FOR_PREFERRED_RANK;
  else if (claim.rank === "deprecated")
    reasonPid = PID_REASON_FOR_DEPRECATED_RANK;
  if (reasonPid === null) return false;

  return (
    present.includes(reasonPid) &&
    present.every((pid) => pid === reasonPid || pid === PID_SUBJECT_NAMED_AS)
  );
}
/**
 * Return the reference URL (P854) of a reference's snak map, but only when
 * there is exactly one such snak and its value is a string.
 * @param {Object} snaks – snak map of a reference, keyed by property id
 * @returns {string|null}
 */
function getReferenceUrl(snaks) {
  const candidates = snaks[PID_REFERENCE_URL] || [];
  if (candidates.length === 1) {
    const value = candidates[0].datavalue?.value;
    if (typeof value === "string") return value;
  }
  return null;
}
/**
 * True when any snak in the given snak map has datatype "external-id".
 * @param {Object} snaks – snak map keyed by property id
 * @returns {boolean}
 */
function hasExternalId(snaks) {
  const isExternalId = (snak) => snak.datatype === "external-id";
  return Object.values(snaks).some((snakList) =>
    (snakList || []).some(isExternalId),
  );
}
/**
 * Compile a regex pattern so that it must match the whole input, using
 * Unicode mode.
 *
 * The pattern is wrapped in a non-capturing group before the ^ / $ anchors
 * are added. This keeps capture-group numbering intact and anchors every
 * branch of a top-level alternation ("a|b" becomes "^(?:a|b)$", not
 * "^a|b$"). It is also safe for patterns that already carry their own
 * anchors or that end in an escaped "\$".
 *
 * @param {string} pattern – regex source
 * @returns {RegExp}
 * @throws {SyntaxError} when the pattern is not a valid Unicode-mode regex
 */
function compileAnchoredRegex(pattern) {
  return new RegExp(`^(?:${pattern})$`, "u");
}
// ==== 13 Occupation helpers ===============================================
/**
 * Make sure occupationParentsCache knows the P279 parents of the given
 * occupation ids and of their ancestors, fetching level by level for at most
 * MAX_TRAVERSAL_DEPTH rounds.
 *
 * After every round – successful or not – the cache entry registered under
 * OCC_CACHE_KEY (if any) is written to localStorage. When a fetch round
 * fails, ids of that round still missing from the cache are recorded with an
 * empty parent set.
 *
 * @param {string[]} occIds – occupation item ids
 * @returns {Promise<Map>} occupationParentsCache itself
 */
async function buildOccupationParents(occIds) {
  const requested = uniq(occIds);
  if (!requested.length) return occupationParentsCache;

  let pending = new Set(
    requested.filter((id) => !occupationParentsCache.has(id)),
  );
  if (!pending.size) return occupationParentsCache;

  // Persist the occupation cache when it is a registered cache entry.
  const persist = () => {
    const occEntry = caches.find((c) => c.key === OCC_CACHE_KEY);
    if (occEntry) cache_saveLocalSt(occEntry);
  };

  let depth = 0;
  while (depth < MAX_TRAVERSAL_DEPTH && pending.size) {
    const batch = Array.from(pending);
    pending = new Set();
    try {
      const entities = await api_fetchEntities(batch);
      for (const id of batch) {
        const parents = extractP279Parents(entities[id]);
        occupationParentsCache.set(id, parents);
        // Unknown parents form the next round.
        for (const parent of parents) {
          if (!occupationParentsCache.has(parent)) pending.add(parent);
        }
      }
      persist();
    } catch (e) {
      console.error(
        `${TOOL_NAME}: buildOccupationParents failed`,
        batch,
        e,
      );
      for (const id of batch) {
        if (!occupationParentsCache.has(id))
          occupationParentsCache.set(id, new Set());
      }
      persist();
    }
    depth++;
  }
  return occupationParentsCache;
}
/**
 * Breadth-first search through occupationParentsCache: is `weakId` reachable
 * from `strongId` by following cached parent links within `maxDepth` levels?
 * Only the local cache is consulted. Identical ids count as a match.
 *
 * @param {string} strongId – start item id
 * @param {string} weakId – ancestor item id to look for
 * @param {number} [maxDepth=MAX_TRAVERSAL_DEPTH]
 * @returns {boolean} false when either id fails isQid
 */
function isSubclassOfLocal(strongId, weakId, maxDepth = MAX_TRAVERSAL_DEPTH) {
  if (!(isQid(strongId) && isQid(weakId))) return false;
  if (strongId === weakId) return true;

  const seen = new Set([strongId]);
  let level = [strongId];
  for (let hop = 0; hop < maxDepth && level.length > 0; hop += 1) {
    const upcoming = [];
    for (const node of level) {
      const parents = occupationParentsCache.get(node);
      if (!parents) continue;
      for (const parent of parents) {
        if (parent === weakId) return true;
        if (seen.has(parent)) continue;
        seen.add(parent);
        upcoming.push(parent);
      }
    }
    level = upcoming;
  }
  return false;
}
// ==== 14 External-ID validation ===========================================
/**
 * Validate an external-id value for a property.
 *
 * Generic checks: must be a string, no leading/trailing whitespace, no tab /
 * vertical-tab / form-feed characters. When propertyRegexCache holds a format
 * pattern for the property, the value must match it as a whole; without a
 * pattern, the value must not end in "/" or "\".
 *
 * @param {string} pid – property id
 * @param {*} value – candidate value
 * @returns {{valid: true}|{valid: false, reason: string, error?: string}}
 *   `reason` is one of "not_a_string", "leading_or_trailing_whitespace",
 *   "contains_tab_or_vertical_whitespace", "regex_mismatch", "regex_error"
 *   (pattern failed to compile or run; `error` holds the message) or
 *   "ends_with_slash".
 */
function validateExternalIdValue(pid, value) {
  const reject = (reason, extra = {}) => ({ valid: false, reason, ...extra });
  const accept = () => ({ valid: true });

  if (typeof value !== "string") return reject("not_a_string");
  if (value !== value.trim()) return reject("leading_or_trailing_whitespace");
  if (/\t|\v|\f/.test(value))
    return reject("contains_tab_or_vertical_whitespace");

  const pattern = propertyRegexCache.get(pid);
  if (!pattern) {
    const endsInSlash = value.endsWith("/") || value.endsWith("\\");
    return endsInSlash ? reject("ends_with_slash") : accept();
  }

  try {
    const matches = compileAnchoredRegex(pattern).test(value);
    if (!matches) return reject("regex_mismatch");
  } catch (err) {
    console.error(`${TOOL_NAME}: bad regex for ${pid}:`, pattern, err);
    return reject("regex_error", { error: err.message });
  }
  return accept();
}
// ==== 15 URL->property matching ============================================
/**
 * Try to extract the external-ID value from a regex match.
 *
 * - With a `replacement`, the URL is rewritten through the regex;
 *   backslash references ("\1") in the replacement are translated to
 *   JavaScript's "$1" form first.
 * - With a single capture group, that group is the id.
 * - With several capture groups, the first group that took part in the match
 *   is the id, provided every other participating group is a substring of it
 *   (nested groups). Groups that did not participate – e.g. the unused branch
 *   of an alternation – are ignored, so an absent first group is handled
 *   rather than throwing.
 *
 * @param {string} searchUrl – URL the regex was matched against
 * @param {RegExp} regex – the pattern that matched
 * @param {Array} matchResult – result of `regex.exec(searchUrl)`
 * @param {string} [replacement] – optional replacement template
 * @param {string} propertyId – used in log messages only
 * @returns {string|null} the extracted id, or null when none can be chosen
 */
function extractId(searchUrl, regex, matchResult, replacement, propertyId) {
  if (replacement) {
    return searchUrl.replace(regex, replacement.replace(/\\(\d+)/g, "$$$1"));
  }
  const groups = matchResult.slice(1);
  if (!groups.length) {
    console.warn(
      `${TOOL_NAME}: match with ${propertyId} but zero groups; url=${searchUrl}`,
    );
    return null;
  }
  if (groups.length === 1) return groups[0] ?? null;

  // Multiple groups: only those that actually participated matter.
  const participating = groups.filter((g) => g != null);
  if (!participating.length) return null;
  const [primary, ...others] = participating;
  if (others.some((g) => !primary.includes(g))) {
    console.warn(
      `${TOOL_NAME}: multiple independent groups for ${propertyId}; url=${searchUrl}`,
    );
    return null;
  }
  if (others.length) {
    console.info(
      `${TOOL_NAME}: accepting ${propertyId} with nested groups; url=${searchUrl}`,
    );
  }
  return primary;
}
/**
 * Match a URL against all cached property URL-match patterns
 * (propertyUrlPatternsCache), in cache order. The first pattern that matches,
 * yields an id via extractId, and whose id passes validateExternalIdValue
 * wins. Patterns that throw are logged and skipped.
 *
 * @param {string} searchUrl
 * @returns {{matched: boolean, suggestedProperty: string|null,
 *   matchedPatternObj: Object|null, extractedId: string|null}}
 */
function matchUrlAgainstPatterns(searchUrl) {
  // Evaluate one pattern; returns the result object or null.
  const evaluate = (propertyId, patternObj) => {
    const regex = compileAnchoredRegex(patternObj.pattern);
    const matchResult = regex.exec(searchUrl);
    if (!matchResult) return null;

    const extractedId = extractId(
      searchUrl,
      regex,
      matchResult,
      patternObj.replacement,
      propertyId,
    );
    if (!extractedId) return null;

    const validation = validateExternalIdValue(propertyId, extractedId);
    if (!validation.valid) {
      console.warn(
        `${TOOL_NAME}: extracted ID "${extractedId}" for ${propertyId} failed validation: ${validation.reason}`,
      );
      return null;
    }
    return {
      matched: true,
      suggestedProperty: propertyId,
      matchedPatternObj: patternObj,
      extractedId,
    };
  };

  for (const [propertyId, patternObjs] of propertyUrlPatternsCache) {
    if (!Array.isArray(patternObjs)) continue;
    for (const patternObj of patternObjs) {
      let hit = null;
      try {
        hit = evaluate(propertyId, patternObj);
      } catch (e) {
        console.warn(
          `${TOOL_NAME}: invalid pattern for ${propertyId}:`,
          patternObj,
          e,
        );
      }
      if (hit) return hit;
    }
  }

  return {
    matched: false,
    suggestedProperty: null,
    matchedPatternObj: null,
    extractedId: null,
  };
}
/**
 * Normalise a raw URL and match it against all cached property URL patterns.
 *
 * The normalised URL is tried first. Only if that fails, the URL is cleaned
 * in recognition mode (cleanUrl with `recognitionMode: true`), normalised
 * again and – when that actually changed it – matched a second time.
 *
 * @param {string} rawUrl
 * @returns {Object} the fields of matchUrlAgainstPatterns' result (matched,
 *   suggestedProperty, matchedPatternObj, extractedId) plus `keepUrl`:
 *   whatever cleanUrl reported when the cleaned URL was tried, false
 *   otherwise. It signals that stripped params should be kept in the
 *   reference URL so the original link still works.
 */
function matchUrlAgainstPatternsWithCleanup(rawUrl) {
  const searchUrl = normalizeUrl(rawUrl);
  const direct = matchUrlAgainstPatterns(searchUrl);
  if (direct.matched) return { ...direct, keepUrl: false };

  const cleanedInfo = cleanUrl(searchUrl, { recognitionMode: true });
  const retryUrl = normalizeUrl(cleanedInfo.url);
  // Cleaning changed nothing: the first (failed) result stands.
  if (retryUrl === searchUrl) return { ...direct, keepUrl: false };

  const retried = matchUrlAgainstPatterns(retryUrl);
  return { ...retried, keepUrl: cleanedInfo.keepUrl };
}
// ==== 16 Diff builders ====================================================
/**
 * Build a "remove claim" diff.
 * @param {string} pid – property of the claim to remove
 * @param {Object} removeClaim – claim to remove
 * @param {Object} [becauseOfClaim] – claim that makes the removal safe; when
 *   given, its id and value are recorded as keepClaimId / keepValue
 * @returns {Object} diff with action ACTION_REMOVE_CLAIM
 */
function makeRemoveClaimDiff(pid, removeClaim, becauseOfClaim) {
  const removal = {
    action: ACTION_REMOVE_CLAIM,
    pid,
    claimId: removeClaim.id,
    value: removeClaim.mainsnak?.datavalue,
  };
  if (!becauseOfClaim) return removal;
  return {
    ...removal,
    keepClaimId: becauseOfClaim.id,
    keepValue: becauseOfClaim.mainsnak?.datavalue,
  };
}
/**
 * Build a "normalize text" diff for a term of the item. Terms are not tied
 * to a claim, so `claimId` is always null.
 * @param {string} field – which term the diff is for
 * @param {string} lang – language code of the term
 * @param {string} before – current text
 * @param {string} after – normalized text
 * @returns {Object} diff with action ACTION_NORMALIZE
 */
function makeNormalizeDiff(field, lang, before, after) {
  const diff = { action: ACTION_NORMALIZE, field, claimId: null };
  diff.lang = lang;
  diff.before = before;
  diff.after = after;
  return diff;
}
/**
 * Build a diff that proposes adding an external-id snak to a reference whose
 * URL matched a property's URL pattern.
 * @param {string} pid – property of the statement owning the reference
 * @param {Object} claim – the statement (its id is recorded)
 * @param {Object} ref – the reference (its hash is recorded)
 * @param {boolean} keepUrl – recorded as is in the diff
 * @param {string} urlValue – the reference URL that matched
 * @param {string} suggestedProperty – external-id property to add
 * @param {string} extractedId – id value extracted from the URL
 * @returns {Object} diff with action ACTION_ADD_EXTERNAL_ID_TO_REFERENCE
 */
function makeAddExternalIdToReferenceDiff(
  pid,
  claim,
  ref,
  keepUrl,
  urlValue,
  suggestedProperty,
  extractedId,
) {
  const { id: claimId } = claim;
  const { hash: refHash } = ref;
  return {
    action: ACTION_ADD_EXTERNAL_ID_TO_REFERENCE,
    pid,
    claimId,
    refHash,
    suggestedProperty,
    keepUrl,
    referenceUrl: urlValue,
    extractedId,
  };
}
/**
 * Call `callback(pid, claim, ref)` for every reference of every
 * non-deprecated claim of the entity and collect the results that are
 * neither null nor undefined, in iteration order.
 * @param {Object} entity
 * @param {function(string, Object, Object): *} callback
 * @returns {Array} collected callback results
 */
function mapReferences(entity, callback) {
  const collected = [];
  for (const [pid, claimList] of Object.entries(entity.claims || {})) {
    for (const claim of claimList) {
      if (claim.rank === "deprecated") continue;
      const references = claim.references || [];
      for (const ref of references) {
        const outcome = callback(pid, claim, ref);
        if (outcome != null) collected.push(outcome);
      }
    }
  }
  return collected;
}
// ==== 17 Detectors ========================================================
/**
 * Find URL values that cleanUrl would change and propose the cleaned form.
 *
 * Three passes, whose diffs are emitted in this order:
 *   1. mainsnaks of URL-datatype claims          (context "claim")
 *   2. URL-datatype qualifier snaks              (context "qualifier")
 *   3. URL-datatype reference snaks, any property (context "reference");
 *      at most one diff – the first dirty URL – per reference
 * Deprecated claims are skipped in passes 1 and 2; pass 3 relies on
 * mapReferences for the iteration.
 *
 * @param {Object} entity
 * @returns {Object[]} diffs with action ACTION_CLEAN_URL
 */
function detectCleanUrls(entity) {
  const found = [];

  // Build the diff for one URL, or null when it is not a string or is
  // already clean. `details` supplies the context-specific fields.
  const diffFor = (urlValue, details) => {
    if (typeof urlValue !== "string") return null;
    const { url: after } = cleanUrl(urlValue);
    if (after === urlValue) return null;
    return { action: ACTION_CLEAN_URL, ...details, before: urlValue, after };
  };

  const claimsByPid = Object.entries(entity.claims || {});

  // 1. Top-level URL claims (mainsnak)
  for (const [pid, claimList] of claimsByPid) {
    for (const claim of claimList) {
      if (claim.rank === "deprecated") continue;
      if (claim.mainsnak?.datatype !== "url") continue;
      const diff = diffFor(claim.mainsnak?.datavalue?.value, {
        context: "claim",
        pid,
        claimId: claim.id,
        snakPid: pid,
      });
      if (diff) found.push(diff);
    }
  }

  // 2. Qualifier URL snaks
  for (const [pid, claimList] of claimsByPid) {
    for (const claim of claimList) {
      if (claim.rank === "deprecated") continue;
      for (const [qPid, qualifierSnaks] of Object.entries(
        claim.qualifiers || {},
      )) {
        for (const snak of qualifierSnaks) {
          if (snak.datatype !== "url") continue;
          const diff = diffFor(snak.datavalue?.value, {
            context: "qualifier",
            pid,
            claimId: claim.id,
            snakPid: qPid,
            snakHash: snak.hash,
          });
          if (diff) found.push(diff);
        }
      }
    }
  }

  // 3. Reference URL snaks (all URL-datatype PIDs, not just P854)
  const referenceDiffs = mapReferences(entity, (pid, claim, ref) => {
    for (const [rPid, refSnaks] of Object.entries(ref.snaks || {})) {
      for (const snak of refSnaks) {
        if (snak.datatype !== "url") continue;
        const diff = diffFor(snak.datavalue?.value, {
          context: "reference",
          pid,
          claimId: claim.id,
          refHash: ref.hash,
          snakPid: rPid,
          snakHash: snak.hash,
        });
        // Only the first dirty URL of a reference is reported.
        if (diff) return diff;
      }
    }
    return undefined;
  });
  found.push(...referenceDiffs);

  return found;
}
/**
 * Propose normalized text for labels, descriptions and aliases.
 * Labels and aliases are passed through normalizeText; descriptions
 * additionally lose trailing semicolons/whitespace. A diff is emitted only
 * when the text actually changes. Order: all labels, then all descriptions,
 * then all aliases.
 * @param {Object} entity
 * @returns {Object[]} diffs built with makeNormalizeDiff
 */
function detectNormalizeLabels(entity) {
  const diffs = [];
  const record = (field, lang, before, after) => {
    if (after !== before) {
      diffs.push(makeNormalizeDiff(field, lang, before, after));
    }
  };

  for (const [lang, label] of Object.entries(entity.labels || {})) {
    const before = label.value;
    record("label", lang, before, normalizeText(before));
  }

  for (const [lang, description] of Object.entries(
    entity.descriptions || {},
  )) {
    const before = description.value;
    const after =
      normalizeText(before)
        ?.replace(/[;\s]+$/g, "")
        .trim() ?? before;
    record("description", lang, before, after);
  }

  for (const [lang, aliasList] of Object.entries(entity.aliases || {})) {
    for (const alias of aliasList) {
      record("alias", lang, alias.value, normalizeText(alias.value));
    }
  }

  return diffs;
}
/**
 * Propose removal of aliases that add nothing. All comparisons use
 * normalizeText. For every language except "mul", an alias is removed when
 * – first matching rule wins –
 *   1. it equals the label of the same language   ("alias_equals_label")
 *   2. it equals the mul label                    ("alias_equals_mul_label")
 *   3. it equals one of the mul aliases           ("alias_equals_mul_alias")
 *   4. it repeats an earlier kept alias of the same language ("duplicate")
 *
 * @param {Object} entity
 * @returns {Object[]} diffs with action ACTION_REMOVE_ALIAS
 */
function detectRemoveDuplicateAliases(entity) {
  const removals = [];

  // Normalised mul aliases / mul label for the cross-language rules.
  const mulAliasNorms = new Set(
    (entity.aliases?.mul || []).map((alias) => normalizeText(alias.value)),
  );
  const mulLabelNorm = entity.labels?.mul?.value
    ? normalizeText(entity.labels.mul.value)
    : null;

  for (const [lang, aliasList] of Object.entries(entity.aliases || {})) {
    if (lang === "mul") continue;
    const labelNorm = normalizeText(entity.labels?.[lang]?.value || "");
    const kept = new Set();

    for (const alias of aliasList) {
      const norm = normalizeText(alias.value);

      let reason = null;
      if (norm === labelNorm) reason = "alias_equals_label";
      else if (mulLabelNorm && norm === mulLabelNorm)
        reason = "alias_equals_mul_label";
      else if (mulAliasNorms.has(norm)) reason = "alias_equals_mul_alias";
      else if (kept.has(norm)) reason = "duplicate";
      else kept.add(norm);

      if (reason !== null) {
        removals.push({
          action: ACTION_REMOVE_ALIAS,
          claimId: null,
          lang,
          value: alias.value,
          reason,
        });
      }
    }
  }
  return removals;
}
/**
 * Detect English descriptions that (partly) consist of an external
 * identifier and propose a description without it.
 *
 * Recognized patterns, tried in order (only the first hit is reported):
 *   - "peerage person id=<n>" (whole description) → description is blanked
 *   - an ORCID iD anywhere in the text → only that part is removed, together
 *     with parentheses left empty or unbalanced by the removal
 * `idPresent` in the diff tells whether the item already has a matching
 * claim for the corresponding property.
 *
 * @param {Object} entity
 * @returns {Object[]} zero or one diff with action ACTION_NORMALIZE
 */
function detectIdDescriptions(entity) {
  const diffs = [];
  const val = entity.descriptions?.en?.value;
  if (!val) return diffs;

  const patterns = [
    {
      regex: /^peerage person id=(\d+)$/i,
      prop: "P4638",
      check: (claim, id) =>
        claim.mainsnak?.datavalue?.value?.includes(`#i${id}`),
      removeMode: "blank",
    },
    {
      regex:
        /\borcid\s*(?:id\s*)?(?:[=#:]\s*)?\(?\s*(\d{4}-\d{4}-\d{4}-\d{3}[0-9X])\s*\)?/i,
      prop: "P496",
      check: (claim, id) => claim.mainsnak?.datavalue?.value === id,
      removeMode: "partial",
    },
  ];

  for (const pattern of patterns) {
    const match = pattern.regex.exec(val);
    if (!match) continue;
    const descId = match[1];
    const idPresent = (entity.claims?.[pattern.prop] || []).some((c) =>
      pattern.check(c, descId),
    );
    const rawAfter =
      pattern.removeMode === "partial"
        ? val
            .replace(match[0], "")
            .replace(/\(\s*\)/g, "") // remove empty () left behind
            .replace(/\(([^)]*?)$/, "$1") // remove unmatched ( with no closing )
            // Remove an unmatched ) – one with no ( anywhere before it. A )
            // that closes an earlier ( must stay.
            .replace(/^([^(]*?)\)/, "$1")
        : "";
    const after =
      pattern.removeMode === "partial" ? normalizeText(rawAfter) : "";
    diffs.push({
      action: ACTION_NORMALIZE,
      field: "description",
      lang: "en",
      before: val,
      after,
      idPresent,
    });
    break;
  }
  return diffs;
}
/**
 * Detect preferred ranks that single nothing out: when every claim of a
 * property is either preferred or deprecated, each preferred claim gets a
 * downgrade diff. `removedQualifier` names P7452 (reason for preferred rank)
 * when the claim carries that qualifier, else null.
 * @param {Object} entity
 * @returns {Object[]} diffs with action ACTION_DOWNGRADE_PREFERRED
 */
function detectRedundantPreferred(entity) {
  const diffs = [];
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    if (!claims?.length) continue;

    // A normal-rank (or otherwise ranked) claim makes "preferred" meaningful.
    const hasOtherRank = claims.some(
      (c) => c.rank !== "preferred" && c.rank !== "deprecated",
    );
    if (hasOtherRank) continue;

    for (const c of claims) {
      if (c.rank !== "preferred") continue;
      const hasReason = c.qualifiers?.[PID_REASON_FOR_PREFERRED_RANK];
      diffs.push({
        action: ACTION_DOWNGRADE_PREFERRED,
        pid,
        claimId: c.id,
        value: c.mainsnak?.datavalue,
        removedQualifier: hasReason ? PID_REASON_FOR_PREFERRED_RANK : null,
      });
    }
  }
  return diffs;
}
/**
 * Detect preferred claims whose first P582 (end time) qualifier lies in the
 * past and propose downgrading them. `removedQualifier` names P7452 (reason
 * for preferred rank) when the claim carries that qualifier, else null.
 * @param {Object} entity
 * @returns {Object[]} diffs with action ACTION_DOWNGRADE_PREFERRED
 */
function detectExpiredPreferred(entity) {
  const diffs = [];
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const c of claims) {
      if (c.rank !== "preferred") continue;

      const endTime = c.qualifiers?.[PID_END_TIME]?.[0]?.datavalue?.value;
      if (!endTime?.time) continue;

      const endDate = parseWikibaseTime(endTime.time);
      const hasExpired = endDate && endDate < new Date();
      if (!hasExpired) continue;

      const hasReason = c.qualifiers?.[PID_REASON_FOR_PREFERRED_RANK];
      diffs.push({
        action: ACTION_DOWNGRADE_PREFERRED,
        pid,
        claimId: c.id,
        value: c.mainsnak?.datavalue,
        removedQualifier: hasReason ? PID_REASON_FOR_PREFERRED_RANK : null,
      });
    }
  }
  return diffs;
}
/**
 * Propose removal of every P582 (end time) qualifier whose snak type is
 * "novalue", on claims of any rank.
 * @param {Object} entity
 * @returns {Object[]} diffs with action ACTION_REMOVE_QUALIFIER
 */
function detectEmptyEndTime(entity) {
  return Object.entries(entity.claims || {}).flatMap(([pid, claims]) =>
    claims.flatMap((c) =>
      (c.qualifiers?.[PID_END_TIME] || [])
        .filter((snak) => snak.snaktype === "novalue")
        .map((snak) => ({
          action: ACTION_REMOVE_QUALIFIER,
          claimId: c.id,
          pid,
          qualifierPid: PID_END_TIME,
          qualifierSnakHash: snak.hash,
          qualifierValue: snak.datavalue,
        })),
    ),
  );
}
/**
 * Detect start/end time qualifiers on P27 (country of citizenship) that
 * merely repeat the item's date of birth (P569) or date of death (P570).
 *
 * Rules:
 *   - P580 (start time) is redundant when it equals the date of birth,
 *     P582 (end time) when it equals the date of death – both compared at
 *     the lowest common precision, calendar model included.
 *
 * Guards:
 *   - Deprecated P27 claims are skipped.
 *   - The date of birth / death to compare against must be unambiguous: of
 *     the non-deprecated claims, the preferred ones are taken if there are
 *     any, otherwise all of them; exactly one candidate must remain.
 *   - Both sides need a time value with year precision (9) or finer.
 *   - Only qualifier snaks of snak type "value" are considered.
 *
 * @param {Object} entity
 * @returns {Object[]} diffs with action ACTION_REMOVE_QUALIFIER; `matchedPid`
 *   names the date property the qualifier duplicates.
 */
function detectRedundantCitizenshipDates(entity) {
  // A usable time value: has a time string, numeric precision, not coarser
  // than a year.
  const isUsableTime = (value) =>
    Boolean(value?.time) &&
    typeof value.precision === "number" &&
    !(value.precision < 9);

  // The single authoritative WbTime value of a date property, or null when
  // absent, ambiguous or unusable.
  const authoritativeDate = (pid) => {
    const live = (entity.claims?.[pid] || []).filter(
      (c) => c.rank !== "deprecated",
    );
    const preferred = live.filter((c) => c.rank === "preferred");
    const candidates = preferred.length ? preferred : live;
    if (candidates.length !== 1) return null;
    const value = candidates[0].mainsnak?.datavalue?.value;
    return isUsableTime(value) ? value : null;
  };

  // Which qualifier is compared against which date property.
  const checks = [
    { qualifierPid: PID_START_TIME, matchedPid: PID_DATE_OF_BIRTH },
    { qualifierPid: PID_END_TIME, matchedPid: PID_DATE_OF_DEATH },
  ]
    .map((check) => ({
      ...check,
      refValue: authoritativeDate(check.matchedPid),
    }))
    .filter((check) => check.refValue !== null);
  if (!checks.length) return [];

  // has_same_normalized_date works on claims, so wrap bare values.
  const asPseudoClaim = (value) => ({ mainsnak: { datavalue: { value } } });

  const diffs = [];
  for (const claim of entity.claims?.[PID_CITIZENSHIP] || []) {
    if (claim.rank === "deprecated") continue;
    for (const { qualifierPid, matchedPid, refValue } of checks) {
      for (const snak of claim.qualifiers?.[qualifierPid] || []) {
        if (snak.snaktype !== "value") continue;
        const qualVal = snak.datavalue?.value;
        if (!isUsableTime(qualVal)) continue;

        const sameDate = has_same_normalized_date(
          asPseudoClaim(qualVal),
          asPseudoClaim(refValue),
          true,
          false,
        );
        if (!sameDate) continue;

        diffs.push({
          action: ACTION_REMOVE_QUALIFIER,
          pid: PID_CITIZENSHIP,
          claimId: claim.id,
          qualifierPid,
          qualifierSnakHash: snak.hash,
          qualifierValue: snak.datavalue,
          // Extra context for renderRow
          matchedPid,
        });
      }
    }
  }
  return diffs;
}
/**
 * Detect date-of-birth / date-of-death claims that only repeat another claim
 * of the same property at lower precision.
 *
 * Non-deprecated claims with a value and precision <= 11 are grouped when
 * their dates agree at the lowest common precision. Within a group, a claim
 * below the group's highest precision is proposed for removal when either
 *   - a claim at the highest precision has references and all of this
 *     claim's references are weak (or it has none), or
 *   - it has no references and another unreferenced claim of the group is
 *     more precise.
 * At most one removal diff is emitted per claim; the first rule wins.
 * Claims without a value (unknown value / no value) are never grouped.
 *
 * @param {Object} entity
 * @returns {Object[]} diffs built with makeRemoveClaimDiff
 */
function detectLowPrecisionDates(entity) {
  const diffs = [];
  const precisionOf = (c) => c.mainsnak.datavalue.value.precision;

  for (const pid of [PID_DATE_OF_BIRTH, PID_DATE_OF_DEATH]) {
    const claims = entity.claims?.[pid] || [];

    // Group by same-date at any precision
    const groups = [];
    for (const c of claims) {
      if (c.rank === "deprecated") continue;
      const value = c.mainsnak?.datavalue?.value;
      // Value-less claims would otherwise all compare equal to each other
      // and break the precision lookups below.
      if (!value) continue;
      if ((value.precision ?? 0) > 11) continue;
      const group = groups.find((g) =>
        has_same_normalized_date(c, g[0], true, false),
      );
      if (group) group.push(c);
      else groups.push([c]);
    }

    for (const group of groups) {
      if (group.length < 2) continue;
      const maxPrec = Math.max(...group.map((c) => precisionOf(c) || 0));

      // Most precise claim of the group that has no references at all.
      let highestNoRefPrec = 0;
      let bestNoRefClaim = null;
      for (const c of group) {
        if (!(c.references || []).length && precisionOf(c) > highestNoRefPrec) {
          highestNoRefPrec = precisionOf(c);
          bestNoRefClaim = c;
        }
      }

      // First claim at the highest precision that has any reference.
      const preciseStrongClaim = group.find(
        (c) => precisionOf(c) === maxPrec && c.references?.length,
      );

      for (const c of group) {
        const prec = precisionOf(c);
        if (!(prec < maxPrec)) continue;
        const refs = c.references || [];
        const allWeak =
          !refs.length || refs.every((r) => isWeakReference(entity, r, refs));
        if (preciseStrongClaim && allWeak) {
          diffs.push(makeRemoveClaimDiff(pid, c, preciseStrongClaim));
        } else if (!refs.length && prec < highestNoRefPrec && bestNoRefClaim) {
          diffs.push(makeRemoveClaimDiff(pid, c, bestNoRefClaim));
        }
      }
    }
  }
  return diffs;
}
/**
 * Proposes merging claims of a date property that state the same date.
 *
 * For every property returned by getDateProperties(), claims are clustered
 * around the first not-yet-clustered claim using
 * has_same_normalized_date(anchor, other, false, false). Each cluster is then
 * split by rank and by qualifiersEqualExceptP31() against the first claim of
 * the split; every split holding two or more claims yields one
 * ACTION_MERGE_CLAIM diff per claim other than the one picked by
 * chooseMergeTarget().
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {object[]} ACTION_MERGE_CLAIM diffs.
 */
function detectMergeSameDateClaims(entity) {
  const merges = [];
  for (const pid of getDateProperties(entity)) {
    const propClaims = entity.claims?.[pid] || [];
    const clusteredIds = new Set();
    for (let start = 0; start < propClaims.length; start++) {
      const anchor = propClaims[start];
      if (clusteredIds.has(anchor.id)) continue;
      clusteredIds.add(anchor.id);

      // Collect every later, still unclustered claim with the same date.
      const cluster = [anchor];
      for (const other of propClaims.slice(start + 1)) {
        if (clusteredIds.has(other.id)) continue;
        if (!has_same_normalized_date(anchor, other, false, false)) continue;
        cluster.push(other);
        clusteredIds.add(other.id);
      }
      if (cluster.length < 2) continue;

      // Split the cluster by rank + qualifiers (ignoring P31:Q26961029).
      const buckets = [];
      for (const member of cluster) {
        const home = buckets.find(
          (b) =>
            b.rank === member.rank &&
            qualifiersEqualExceptP31(b.claims[0], member),
        );
        if (home) {
          home.claims.push(member);
        } else {
          buckets.push({ rank: member.rank, claims: [member] });
        }
      }

      for (const bucket of buckets) {
        if (bucket.claims.length < 2) continue;
        const keeper = chooseMergeTarget(bucket.claims);
        for (const member of bucket.claims) {
          if (member.id === keeper.id) continue;
          merges.push({
            action: ACTION_MERGE_CLAIM,
            pid,
            fromClaimId: member.id,
            toClaimId: keeper.id,
            value: member.mainsnak?.datavalue,
          });
        }
      }
    }
  }
  return merges;
}
/**
 * Finds reference snaks stored under the wrong URL property and proposes the
 * property they should use instead.
 *
 * Checked in this order (first hit wins):
 *   1. P2699/P854 whose string value satisfies isArchiveUrl() -> P1065.
 *   2. P2699/P854 whose string value satisfies isWikimediaImportUrl() -> P4656.
 *   3. Any remaining P2699 snak -> P854.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {object[]} ACTION_CHANGE_PROPERTY diffs with context "reference".
 */
function detectWrongPropertyClaims(entity) {
  const urlLikeProps = [PID_URL, PID_REFERENCE_URL];

  // Returns the property a snak should be moved to, or null to leave it.
  const replacementFor = (prop, value) => {
    if (urlLikeProps.includes(prop)) {
      const isText = typeof value === "string";
      if (isText && isArchiveUrl(value)) return PID_ARCHIVE_URL;
      if (isText && isWikimediaImportUrl(value)) return PID_WIKIMEDIA_IMPORT_URL;
    }
    return prop === PID_URL ? PID_REFERENCE_URL : null;
  };

  const changes = [];
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      for (const ref of claim.references || []) {
        for (const [oldProperty, snakList] of Object.entries(ref.snaks || {})) {
          for (const snak of snakList) {
            const newProperty = replacementFor(
              oldProperty,
              snak.datavalue?.value,
            );
            if (newProperty === null) continue;
            changes.push({
              action: ACTION_CHANGE_PROPERTY,
              context: "reference",
              pid,
              claimId: claim.id,
              refHash: ref.hash,
              snakHash: snak.hash,
              oldProperty,
              newProperty,
              value: snak.datavalue,
            });
          }
        }
      }
    }
  }
  return changes;
}
/**
 * Finds "retrieved" (P813) qualifiers on claims whose main snak has the
 * "external-id" datatype and proposes moving each one into a reference.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {object[]} One ACTION_MOVE_QUALIFIER_TO_REFERENCE diff per
 *   P813 qualifier snak found.
 */
function detectMoveRetrievedFromExternalId(entity) {
  const moves = [];
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      const isExternalId = claim.mainsnak?.datatype === "external-id";
      if (!isExternalId) continue;
      const retrievedQualifiers = claim.qualifiers?.[PID_RETRIEVED] || [];
      retrievedQualifiers.forEach((qualifier) => {
        moves.push({
          action: ACTION_MOVE_QUALIFIER_TO_REFERENCE,
          claimId: claim.id,
          pid,
          qualifierPid: PID_RETRIEVED,
          qualifierSnakHash: qualifier.hash,
          qualifierValue: qualifier.datavalue,
        });
      });
    }
  }
  return moves;
}
/**
 * Proposes merging claims of one property that carry the same value.
 *
 * Claims accepted by canBeGrouped() are bucketed by
 * normalizeDataValueKey(value) plus a rank bucket:
 *   - non-normal claims first: deprecated -> "deprecated", anything else
 *     -> "active";
 *   - then each normal claim joins the existing "active" bucket for its
 *     value, else the existing "deprecated" bucket, else a new "active" one.
 * Every bucket with two or more claims yields ACTION_MERGE_CLAIM diffs toward
 * the claim chosen by chooseMergeTarget(); for "deprecated" buckets the
 * `preferDeprecated` option is passed as true.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {object[]} ACTION_MERGE_CLAIM diffs.
 */
function detectDuplicateValues(entity) {
  const merges = [];
  const valueKeyOf = (claim) =>
    normalizeDataValueKey(claim.mainsnak?.datavalue?.value);

  for (const [pid, rawClaims] of Object.entries(entity.claims || {})) {
    const claims = rawClaims || [];
    if (!claims.length) continue;
    const buckets = new Map();

    // Pass 1: preferred / deprecated claims seed the buckets.
    const ranked = claims.filter(
      (c) => c.rank !== "normal" && canBeGrouped(c),
    );
    for (const claim of ranked) {
      const suffix = claim.rank === "deprecated" ? "deprecated" : "active";
      const key = `${valueKeyOf(claim)}|${suffix}`;
      const members = buckets.get(key);
      if (members) members.push(claim);
      else buckets.set(key, [claim]);
    }

    // Pass 2: normal claims attach to an existing bucket in priority order
    // (active, then deprecated) or open a new active bucket.
    const normals = claims.filter(
      (c) => c.rank === "normal" && canBeGrouped(c),
    );
    for (const claim of normals) {
      const valueKey = valueKeyOf(claim);
      const home =
        buckets.get(`${valueKey}|active`) ??
        buckets.get(`${valueKey}|deprecated`);
      if (home) home.push(claim);
      else buckets.set(`${valueKey}|active`, [claim]);
    }

    // Emit merges; a deprecated bucket keeps its deprecated claim as target
    // when chooseMergeTarget honours preferDeprecated.
    buckets.forEach((members, key) => {
      if (members.length < 2) return;
      const keeper = chooseMergeTarget(members, {
        preferDeprecated: key.endsWith("|deprecated"),
      });
      members
        .filter((claim) => claim.id !== keeper.id)
        .forEach((claim) => {
          merges.push({
            action: ACTION_MERGE_CLAIM,
            pid,
            fromClaimId: claim.id,
            toClaimId: keeper.id,
            value: claim.mainsnak?.datavalue,
          });
        });
    });
  }
  return merges;
}
/**
 * Finds references on a claim that are redundant because another reference
 * on the same claim already contains all of their (comparable) content.
 *
 * Snaks for retrieved (P813), title (P1476) and subject-named-as (P1810) are
 * ignored when comparing. References are ordered by retrieved timestamp
 * (newest first, missing timestamps last), then by number of comparable
 * properties, then by containment, then by hash. Walking that order, a
 * reference becomes a removal candidate when its comparable snak values are
 * a subset of an earlier "anchor" reference whose timestamp is not older.
 * References with no comparable property are left alone.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {object[]} ACTION_REMOVE_REFS diffs, one per redundant reference.
 */
function detectDuplicateRefs(entity) {
  const diffs = [];
  // Use the shared PID constants rather than repeating the literal ids.
  const IGNORE_PROPS = new Set([
    PID_RETRIEVED,
    PID_TITLE,
    PID_SUBJECT_NAMED_AS,
  ]);
  const comparablePids = (ref) =>
    Object.keys(ref.snaks || {}).filter((p) => !IGNORE_PROPS.has(p));
  // pid -> Set of JSON-serialised snak values. Sets are built once per
  // reference so subset checks inside the sort comparator stay cheap.
  function buildFieldMap(ref) {
    const map = {};
    for (const pid of comparablePids(ref)) {
      const vals = (ref.snaks[pid] || []).map((s) =>
        JSON.stringify(s.datavalue?.value ?? null),
      );
      if (vals.length) map[pid] = new Set(vals);
    }
    return map;
  }
  // True when every value of every property in mapA also occurs in mapB.
  function isSubset(mapA, mapB) {
    for (const pid in mapA) {
      const setB = mapB[pid];
      if (!setB) return false;
      for (const v of mapA[pid]) {
        if (!setB.has(v)) return false;
      }
    }
    return true;
  }
  for (const pid in entity.claims || {}) {
    for (const claim of entity.claims[pid]) {
      const refs = claim.references || [];
      if (refs.length < 2) continue;
      // Per-reference metadata, computed once and looked up by hash.
      const meta = new Map(
        refs.map((r) => [
          r.hash,
          {
            map: buildFieldMap(r),
            ts: parseRetrievedTimestamp(r) ?? -Infinity,
            propCount: comparablePids(r).length,
          },
        ]),
      );
      const sorted = [...refs].sort((a, b) => {
        const ma = meta.get(a.hash);
        const mb = meta.get(b.hash);
        if (mb.ts !== ma.ts) return mb.ts - ma.ts;
        if (mb.propCount !== ma.propCount) return mb.propCount - ma.propCount;
        if (isSubset(ma.map, mb.map) && !isSubset(mb.map, ma.map)) return 1;
        if (isSubset(mb.map, ma.map) && !isSubset(ma.map, mb.map)) return -1;
        return a.hash.localeCompare(b.hash);
      });
      const groups = [];
      for (const ref of sorted) {
        const m = meta.get(ref.hash);
        if (!m.propCount) continue;
        const anchor = groups.find((g) => {
          const anchorMeta = meta.get(g.anchorRef.hash);
          return isSubset(m.map, anchorMeta.map) && m.ts <= anchorMeta.ts;
        });
        if (anchor) {
          anchor.members.push(ref);
        } else {
          groups.push({ anchorRef: ref, members: [] });
        }
      }
      for (const g of groups) {
        for (const ref of g.members) {
          diffs.push({
            action: ACTION_REMOVE_REFS,
            pid,
            claimId: claim.id,
            refHash: ref.hash,
            removedKeys: Object.keys(ref.snaks),
          });
        }
      }
    }
  }
  return diffs;
}
/**
 * Decides whether a reference can be split into several references.
 *
 * A reference qualifies only if it has a P854 (reference URL) key and every
 * key is one of P854, P813, P1065, P2960, P143 or P4656. Snaks under P813
 * and P2960 are not counted. Every other snak is classified as:
 *   - archive   : url-datatype snak under P1065 or whose value satisfies
 *                 isArchiveUrl();
 *   - wikimedia : url-datatype snak whose value satisfies
 *                 isWikimediaImportUrl();
 *   - url       : any other url-datatype snak with a non-empty string value;
 *   - other     : any snak whose datatype is not "url" (e.g. under P143).
 * The reference is splittable when it has at most one archive snak, at most
 * one P2960 snak, and more than one wikimedia/url/other snak in total.
 *
 * @param {object} ref Reference JSON; only `ref.snaks` is read.
 * @returns {{splittable: boolean, urlCount?: number}} `urlCount` is the
 *   number of archive + wikimedia + url snaks and is only present when
 *   `splittable` is true.
 */
function isSplittableReference(ref) {
  const snaks = ref.snaks || {};
  const pids = Object.keys(snaks);
  const allowedPids = new Set([
    PID_REFERENCE_URL,
    PID_RETRIEVED,
    PID_ARCHIVE_URL,
    PID_ARCHIVE_DATE,
    PID_IMPORTED_FROM,
    PID_WIKIMEDIA_IMPORT_URL,
  ]);
  if (!pids.includes(PID_REFERENCE_URL)) return { splittable: false };
  if (!pids.every((p) => allowedPids.has(p))) return { splittable: false };
  let archiveCount = 0;
  let wikimediaCount = 0;
  let urlCount = 0;
  let otherCount = 0;
  for (const p of pids) {
    if (p === PID_RETRIEVED || p === PID_ARCHIVE_DATE) continue;
    for (const snak of snaks[p] || []) {
      if (snak.datatype !== "url") {
        // Count each non-URL snak exactly once.
        otherCount++;
        continue;
      }
      const v = snak.datavalue?.value;
      if (!v || typeof v !== "string") continue;
      if (isArchiveUrl(v) || p === PID_ARCHIVE_URL) archiveCount++;
      else if (isWikimediaImportUrl(v)) wikimediaCount++;
      else urlCount++;
    }
  }
  if (archiveCount > 1) return { splittable: false };
  if ((snaks[PID_ARCHIVE_DATE] || []).length > 1) return { splittable: false };
  if (wikimediaCount + urlCount + otherCount <= 1) return { splittable: false };
  return {
    splittable: true,
    urlCount: archiveCount + wikimediaCount + urlCount,
  };
}
/**
 * Runs isSplittableReference() over every reference visited by
 * mapReferences() and describes each splittable one as an
 * ACTION_SPLIT_REFERENCE_URLS diff.
 *
 * @param {object} entity Entity JSON handed to mapReferences().
 * @returns {*} Whatever mapReferences() returns for the per-reference callback.
 */
function detectMultipleReferenceUrls(entity) {
  const describeSplit = (pid, claim, ref) => {
    const verdict = isSplittableReference(ref);
    if (verdict.splittable) {
      return {
        action: ACTION_SPLIT_REFERENCE_URLS,
        pid,
        claimId: claim.id,
        refHash: ref.hash,
        urlCount: verdict.urlCount,
      };
    }
    return null;
  };
  return mapReferences(entity, describeSplit);
}
/**
 * For every reference that has a reference URL but no external-id snak,
 * checks whether the URL matches a known external-id URL pattern and, if so,
 * proposes adding the corresponding external-id snak to the reference.
 *
 * Pattern matching is cached per normalised URL for the duration of one
 * call. Cache hits and misses go through the same acceptance test: the
 * pattern must have matched, an id must have been extracted, and the
 * suggested property must differ from the property of the claim being
 * referenced.
 *
 * @param {object} entity Entity JSON handed to mapReferences().
 * @returns {*} Whatever mapReferences() returns for the per-reference callback.
 */
function detectAddExternalIdToReference(entity) {
  const urlMatchCache = new Map();
  return mapReferences(entity, (currentPid, claim, ref) => {
    const snaks = ref.snaks || {};
    const urlValue = getReferenceUrl(snaks);
    if (!urlValue || hasExternalId(snaks)) return null;
    const cacheKey = normalizeUrl(urlValue);
    let match = urlMatchCache.get(cacheKey);
    if (!match) {
      match = { ...matchUrlAgainstPatternsWithCleanup(urlValue) };
      urlMatchCache.set(cacheKey, match);
    }
    // Single acceptance test for both cached and fresh results, so a URL
    // without an extractable id is rejected every time it is seen.
    if (
      !match.matched ||
      !match.extractedId ||
      currentPid === match.suggestedProperty
    ) {
      return null;
    }
    return makeAddExternalIdToReferenceDiff(
      currentPid,
      claim,
      ref,
      match.keepUrl,
      urlValue,
      match.suggestedProperty,
      match.extractedId,
    );
  });
}
/**
 * Proposes removal of weakly supported occupation (P106) claims that are
 * implied by a better supported, more specific occupation on the same item.
 *
 * "Weak" claims are normal-rank claims for which hasValidReference() is
 * false; "strong" claims are those with a valid reference or preferred rank.
 * Deprecated claims and claims without an item value (unknown value / no
 * value) are ignored. After buildOccupationParents() has loaded the hierarchy
 * for all involved ids, a weak claim is proposed for removal when
 * isSubclassOfLocal(strongId, weakId) holds for some strong claim with a
 * different id; the first such strong claim is reported as the keeper.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {Promise<object[]>} Diffs produced by makeRemoveClaimDiff(); an
 *   empty array when there is nothing to compare or the hierarchy lookup
 *   fails (the failure is logged).
 */
async function detectRedundantOccupation(entity) {
  const occupationId = (c) => c.mainsnak?.datavalue?.value?.id;
  const weakOccs = [];
  const strongOccs = [];
  for (const c of entity.claims?.[PID_OCCUPATION] || []) {
    if (c.rank !== "normal" && c.rank !== "preferred") continue;
    // Unknown-value / no-value claims have no id to look up in the hierarchy.
    if (!occupationId(c)) continue;
    const referenced = hasValidReference(entity, c);
    if (c.rank === "normal" && !referenced) weakOccs.push(c);
    if (referenced || c.rank === "preferred") strongOccs.push(c);
  }
  if (!weakOccs.length || !strongOccs.length) return [];
  const allIds = [...weakOccs, ...strongOccs].map(occupationId);
  try {
    await buildOccupationParents(allIds);
  } catch (e) {
    console.error(`${TOOL_NAME}: buildOccupationParents failed`, e);
    return [];
  }
  const diffs = [];
  for (const weakClaim of weakOccs) {
    const weakId = occupationId(weakClaim);
    const keeper = strongOccs.find((strongClaim) => {
      const strongId = occupationId(strongClaim);
      return strongId !== weakId && isSubclassOfLocal(strongId, weakId);
    });
    if (keeper) {
      diffs.push(makeRemoveClaimDiff(PID_OCCUPATION, weakClaim, keeper));
    }
  }
  return diffs;
}
/**
 * Proposes removal of unreferenced date claims that duplicate a referenced
 * claim of the same property expressed in a different calendar model.
 *
 * For every property returned by getDateProperties(), each claim without
 * references is compared against the claims that have references. The
 * unreferenced claim is reported when a referenced claim has a different
 * `calendarmodel` and has_same_normalized_date(a, b, false, true) is true.
 * Only the first matching referenced claim is used as the keeper, so each
 * unreferenced claim is reported at most once.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {object[]} Diffs produced by makeRemoveClaimDiff().
 */
function detectJulianGregorianDuplicateDates(entity) {
  const diffs = [];
  const hasRefs = (c) => (c.references || []).length > 0;
  const valueOf = (c) => c.mainsnak?.datavalue?.value;
  for (const pid of getDateProperties(entity)) {
    const claims = entity.claims?.[pid] || [];
    const refClaims = claims.filter((c) => hasRefs(c) && valueOf(c));
    for (const a of claims) {
      if (hasRefs(a)) continue;
      const aVal = valueOf(a);
      if (!aVal) continue;
      const keeper = refClaims.find(
        (b) =>
          aVal.calendarmodel !== valueOf(b).calendarmodel &&
          has_same_normalized_date(a, b, false, true),
      );
      if (keeper) diffs.push(makeRemoveClaimDiff(pid, a, keeper));
    }
  }
  return diffs;
}
/**
 * Finds references from which obsolete external-ID snaks can be stripped
 * while the rest of the reference is kept.
 *
 * A reference on a non-deprecated claim is reported when all of these hold:
 *   - it has no P813 (retrieved) key;
 *   - at least one of its properties is listed in `obsoleteIdProps` and
 *     carries an external-id snak with a value;
 *   - at least one other property carries an external-id snak with a value
 *     (so meaningful content survives the removal).
 *
 * No check is made for other references on the same statement; the diff is
 * emitted whenever the three conditions above are met.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {object[]} ACTION_REMOVE_OBSOLETE_SNAKS diffs; `obsoletePids` and
 *   `removedKeys` hold the obsolete property ids.
 */
function detectObsoleteSnaksInReferences(entity) {
  // True when the snak list holds at least one valued external-id snak.
  const carriesExtIdValue = (snakList) =>
    (snakList || []).some(
      (s) => s.datatype === "external-id" && s.datavalue?.value,
    );

  const found = [];
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      if (claim.rank === "deprecated") continue;
      for (const ref of claim.references || []) {
        const snaks = ref.snaks || {};
        const refPids = Object.keys(snaks);
        const hasRetrieved = refPids.some((p) => p === PID_RETRIEVED);
        if (hasRetrieved) continue;

        const extIdPids = refPids.filter((p) => carriesExtIdValue(snaks[p]));
        const obsoletePids = extIdPids.filter((p) => obsoleteIdProps.has(p));
        if (obsoletePids.length === 0) continue;
        // Something non-obsolete must remain once the obsolete ids are gone.
        if (extIdPids.length === obsoletePids.length) continue;

        found.push({
          action: ACTION_REMOVE_OBSOLETE_SNAKS,
          pid,
          claimId: claim.id,
          refHash: ref.hash,
          obsoletePids,
          removedKeys: obsoletePids,
        });
      }
    }
  }
  return found;
}
/**
 * Finds date properties holding exactly one normal-rank and exactly one
 * deprecated-rank claim (claims of other ranks are not counted) where the
 * deprecated claim is merely a less precise statement of the same date.
 *
 * A pair qualifies when:
 *   - both claims have a precision and the normal claim is at least as
 *     precise as the deprecated one;
 *   - has_same_normalized_date(normal, deprecated, true, false) is true;
 *   - the normal claim has at least one reference;
 *   - the deprecated claim has no P2241 qualifier, or all of its P2241
 *     qualifiers are Q42727519 (less precise/accurate).
 *
 * Two diffs sharing one `rowId` are emitted per pair:
 *   1. ACTION_UPGRADE_PRECISE_DATE on the normal (precise) claim.
 *   2. A hidden ACTION_DOWNGRADE_PREFERRED on the deprecated claim, flagged
 *      `fromDeprecated: true`, with `removedQualifier` set to P2241 when such
 *      a qualifier exists and null otherwise.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {object[]} The paired diffs described above.
 */
function detectUpgradePreciseDate(entity) {
  const out = [];
  const precisionOrMissing = (claim) =>
    claim.mainsnak?.datavalue?.value?.precision ?? -1;

  for (const pid of getDateProperties(entity)) {
    const all = entity.claims?.[pid] || [];
    const normals = all.filter((c) => c.rank === "normal");
    const deprecateds = all.filter((c) => c.rank === "deprecated");
    if (!(normals.length === 1 && deprecateds.length === 1)) continue;
    const [precise] = normals;
    const [coarse] = deprecateds;

    const precisePrec = precisionOrMissing(precise);
    if (precisePrec < 0) continue;
    const coarsePrec = precisionOrMissing(coarse);
    if (coarsePrec < 0) continue;
    // The kept claim must not be coarser than the one it supersedes
    // (typically day vs year: 11 >= 9).
    if (precisePrec < coarsePrec) continue;
    if (!has_same_normalized_date(precise, coarse, true, false)) continue;
    if ((precise.references || []).length === 0) continue;

    // Any stated deprecation reason other than "less precise" blocks the fix.
    const reasons = coarse.qualifiers?.[PID_REASON_FOR_DEPRECATED_RANK] || [];
    const hasReason = reasons.length > 0;
    const hasForeignReason = reasons.some(
      (s) => s.datavalue?.value?.id !== QID_LESS_PRECISE,
    );
    if (hasReason && hasForeignReason) continue;

    const rowId = `upgradePreciseDate_${pid}_${precise.id}`;
    // Visible row: promote the precise claim.
    out.push({
      action: ACTION_UPGRADE_PRECISE_DATE,
      rowId,
      pid,
      claimId: precise.id,
      value: precise.mainsnak?.datavalue,
      // paired deprecated claim (for display)
      deprClaimId: coarse.id,
      deprValue: coarse.mainsnak?.datavalue,
    });
    // Hidden companion: same rowId so it follows the visible row's checkbox.
    out.push({
      action: ACTION_DOWNGRADE_PREFERRED,
      _hidden: true,
      rowId,
      pid,
      claimId: coarse.id,
      value: coarse.mainsnak?.datavalue,
      removedQualifier: hasReason ? PID_REASON_FOR_DEPRECATED_RANK : null,
      fromDeprecated: true,
    });
  }
  return out;
}
/**
 * Finds reference snaks that use P248 (stated in) with a value present in
 * `wikipediaEditionsCache` and proposes switching the property to P143
 * (imported from Wikimedia project).
 *
 * When the editions cache is empty, it is refreshed first through
 * refreshCacheWithNotify(), provided its entry is registered in `caches`.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {Promise<object[]>} ACTION_CHANGE_PROPERTY diffs with context
 *   "reference".
 */
async function detectConvertWikipediaStatedIn(entity) {
  const diffs = [];
  if (!wikipediaEditionsCache.size) {
    const entry = caches.find((c) => c.key === WIKIPEDIA_EDITIONS_CACHE_KEY);
    // Same guard as the stated-in cache users: never pass `undefined` on.
    if (entry) await refreshCacheWithNotify(entry);
  }
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      for (const ref of claim.references || []) {
        for (const s of ref.snaks?.[PID_STATED_IN] || []) {
          const qid = s?.datavalue?.value?.id;
          if (!qid || !wikipediaEditionsCache.has(qid)) continue;
          diffs.push({
            action: ACTION_CHANGE_PROPERTY,
            context: "reference",
            pid,
            claimId: claim.id,
            refHash: ref.hash,
            snakHash: s.hash,
            oldProperty: PID_STATED_IN,
            newProperty: PID_IMPORTED_FROM,
            value: s.datavalue,
          });
        }
      }
    }
  }
  return diffs;
}
/**
 * Finds references whose single P248 (stated in) value is listed as
 * not-allowed for the single external-id property used in the same
 * reference, and proposes replacing it with that property's preferred
 * stated-in value from `propertyStatedInCache`.
 *
 * Only references with exactly one external-id property (judged by the
 * datatype of the first snak under each property) and exactly one P248 snak
 * are examined. The stated-in cache is refreshed first when it is empty and
 * its entry is registered in `caches`.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {Promise<object[]>} ACTION_CHANGE_VALUE diffs with context
 *   "reference".
 */
async function detectInvalidStatedInReference(entity) {
  if (!propertyStatedInCache.size) {
    const cacheEntry = caches.find((c) => c.key === STATED_IN_CACHE_KEY);
    if (cacheEntry) await refreshCacheWithNotify(cacheEntry);
  }

  const found = [];
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      for (const ref of claim.references || []) {
        const snaks = ref.snaks || {};
        const extIdPids = Object.keys(snaks).filter(
          (p) => snaks[p]?.[0]?.datatype === "external-id",
        );
        const statedInSnaks = snaks[PID_STATED_IN] || [];
        if (extIdPids.length !== 1 || statedInSnaks.length !== 1) continue;
        const [extIdPid] = extIdPids;
        if (!extIdPid) continue;

        const [statedInSnak] = statedInSnaks;
        const statedInQid = statedInSnak?.datavalue?.value?.id;
        const prefs = propertyStatedInCache.get(extIdPid);
        if (!prefs || !statedInQid) continue;
        // Only act when the current value is explicitly not allowed and a
        // replacement is known.
        if (!prefs.notAllowed?.has(statedInQid)) continue;
        if (!prefs.preferred) continue;

        found.push({
          action: ACTION_CHANGE_VALUE,
          context: "reference",
          pid,
          claimId: claim.id,
          refHash: ref.hash,
          snakHash: statedInSnak.hash,
          oldValue: statedInQid,
          newValue: prefs.preferred,
          externalIdPid: extIdPid,
        });
      }
    }
  }
  return found;
}
/**
 * Detect references that carry both P143 (imported from Wikimedia project) and
 * P4656 (Wikimedia import URL), each with exactly one value, where the
 * Wikipedia edition implied by the P4656 URL differs from what P143 states.
 *
 * For example, a P4656 value of "https://pl.wikipedia.org/w/index.php?title=…"
 * implies the Polish Wikipedia edition; if P143 holds a different QID, an
 * ACTION_CHANGE_VALUE diff is emitted to correct it.
 *
 * The expected QID is looked up by language code in `wikipediaEditionsCache`
 * (QID -> language code; the first QID seen for a language wins). The cache is
 * refreshed first when it is empty and its entry is registered in `caches`.
 *
 * Only hostnames of the form <lang>.wikipedia.org (optionally prefixed with
 * "www.") are handled; any other hostname, an unparsable URL, or a language
 * code missing from the cache causes the reference to be skipped.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {Promise<object[]>} ACTION_CHANGE_VALUE diffs with context
 *   "reference".
 */
async function detectMismatchedWikimediaImport(entity) {
  const diffs = [];
  if (!wikipediaEditionsCache.size) {
    const entry = caches.find((c) => c.key === WIKIPEDIA_EDITIONS_CACHE_KEY);
    // Same guard as the stated-in cache users: never pass `undefined` on.
    if (entry) await refreshCacheWithNotify(entry);
  }
  // Build a reverse map: language code -> QID (first seen wins per language).
  const langToQid = new Map();
  for (const [qid, lang] of wikipediaEditionsCache) {
    if (!langToQid.has(lang)) langToQid.set(lang, qid);
  }
  // Returns the edition QID implied by the URL, or null when it cannot be
  // derived (bad URL, non-Wikipedia host, unknown language code).
  const expectedEditionFor = (importUrl) => {
    let hostname;
    try {
      ({ hostname } = new URL(importUrl));
    } catch {
      return null;
    }
    const m = hostname
      .toLowerCase()
      .replace(/^www\./, "")
      .match(/^([a-z-]+)\.wikipedia\.org$/);
    return m ? (langToQid.get(m[1]) ?? null) : null;
  };
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      for (const ref of claim.references || []) {
        const snaks = ref.snaks || {};
        const p143Snaks = snaks[PID_IMPORTED_FROM] || [];
        const p4656Snaks = snaks[PID_WIKIMEDIA_IMPORT_URL] || [];
        // Only handle references with exactly one P143 and one P4656 snak.
        if (p143Snaks.length !== 1 || p4656Snaks.length !== 1) continue;
        const [p143Snak] = p143Snaks;
        const currentQid = p143Snak?.datavalue?.value?.id;
        const importUrl = p4656Snaks[0]?.datavalue?.value;
        if (!isQid(currentQid) || typeof importUrl !== "string") continue;
        const expectedQid = expectedEditionFor(importUrl);
        if (!expectedQid || expectedQid === currentQid) continue;
        diffs.push({
          action: ACTION_CHANGE_VALUE,
          context: "reference",
          pid,
          claimId: claim.id,
          refHash: ref.hash,
          snakHash: p143Snak.hash,
          oldValue: currentQid,
          newValue: expectedQid,
          importUrl,
        });
      }
    }
  }
  return diffs;
}
/**
 * Detect P1343 (described by source) claims whose value is an allowed
 * stated-in for an external-id property present on the same item, OR whose
 * value is a not-allowed stated-in for such a property (and is not an allowed
 * value for any external-id property on the item).
 *
 * Simple variant – P1343 has no qualifiers and no references:
 *   Emit ACTION_REMOVE_CLAIM for the P1343 claim.
 *
 * Complex variant – P1343 has qualifiers and/or a reference:
 *   Emit ACTION_ABSORB_CLAIM carrying the target external-id claim, a snapshot
 *   of the P1343 qualifiers (with their order) and the hashes of its
 *   references, so the apply step can migrate them.
 *
 * In both cases the detector only fires when:
 *   - The P1343 rank is not deprecated.
 *   - The P1343 has at most one reference (multiple refs -> skip, too ambiguous).
 *   - The external-id claim rank is not deprecated.
 *   - The external-id property has at least one allowed stated-in in
 *     `propertyStatedInCache`.
 *
 * Not-allowed candidates come from the cache's `notAllowed` set and from
 * P248 values already present in references of the external-id claim. Each
 * external-id claim is registered at most once per QID, so repeated
 * occurrences of the same QID do not create phantom candidates.
 *
 * When several external-id claims are candidates, the target is resolved from
 * the single P1343 reference:
 *   1. The ref contains exactly one ext-id property (first snak per property
 *      is inspected) whose PID+value matches exactly one candidate.
 *   2. The ref URL (P854, else P2699) matches a URL pattern whose extracted
 *      ID matches exactly one candidate.
 *   3. Exactly one candidate's ext-id value appears verbatim in the ref URL
 *      (e.g. OCLC 749229748 inside https://search.worldcat.org/title/749229748).
 * If none of these resolves to exactly one candidate, the P1343 claim is
 * skipped.
 *
 * @param {object} entity Entity JSON; `entity.claims` is read.
 * @returns {Promise<object[]>} ACTION_REMOVE_CLAIM / ACTION_ABSORB_CLAIM diffs.
 */
async function detectAbsorbDescribedBySource(entity) {
  const diffs = [];
  // Ensure stated-in cache is populated
  if (!propertyStatedInCache.size) {
    const entry = caches.find((c) => c.key === STATED_IN_CACHE_KEY);
    if (entry) await refreshCacheWithNotify(entry);
  }
  const describedByClaims = (
    entity.claims?.[PID_DESCRIBED_BY_SOURCE] || []
  ).filter((c) => c.rank !== "deprecated");
  if (!describedByClaims.length) return diffs;

  // Lookup maps over external-id properties present on this item:
  //   extIdByAllowed    – allowedQid    -> [{extIdPid, claim, preferred}]
  //   extIdByNotAllowed – notAllowedQid -> [{extIdPid, claim, preferred}]
  const extIdByAllowed = new Map();
  const extIdByNotAllowed = new Map();
  // Registers `entry` under `qid` once; the same QID can be offered several
  // times for one claim (cache set + several stated-in snaks).
  const register = (index, qid, entry) => {
    const list = index.get(qid);
    if (!list) index.set(qid, [entry]);
    else if (!list.includes(entry)) list.push(entry);
  };
  for (const [extIdPid, extIdClaims] of Object.entries(entity.claims || {})) {
    const prefs = propertyStatedInCache.get(extIdPid);
    if (!prefs?.allowed?.size) continue;
    for (const extClaim of extIdClaims) {
      if (extClaim.rank === "deprecated") continue;
      const entry = { extIdPid, claim: extClaim, preferred: prefs.preferred };
      for (const allowedQid of prefs.allowed) {
        register(extIdByAllowed, allowedQid, entry);
      }
      const seedNotAllowed = (qid) => {
        if (!isQid(qid) || prefs.allowed.has(qid)) return;
        register(extIdByNotAllowed, qid, entry);
      };
      for (const qid of prefs.notAllowed || []) seedNotAllowed(qid);
      for (const ref of extClaim.references || []) {
        for (const siSnak of ref.snaks?.[PID_STATED_IN] || []) {
          seedNotAllowed(siSnak?.datavalue?.value?.id);
        }
      }
    }
  }

  // Picks the external-id entry a P1343 claim should be absorbed into, or
  // null when the choice is ambiguous. `ref` is the claim's only reference
  // (may be undefined).
  const resolveTarget = (candidates, ref) => {
    if (candidates.length === 1) return candidates[0];
    const onlyMatch = (predicate) => {
      const hits = candidates.filter(predicate);
      return hits.length === 1 ? hits[0] : null;
    };
    const refSnaks = ref?.snaks || {};
    // Strategy 1: an ext-id snak in the ref identifies the candidate.
    const extIdInRef = [];
    for (const [rPid, snakList] of Object.entries(refSnaks)) {
      const snak = snakList?.[0];
      if (
        snak?.datatype === "external-id" &&
        typeof snak.datavalue?.value === "string"
      ) {
        extIdInRef.push([rPid, snak.datavalue.value]);
      }
    }
    // Multiple ext-ids in the ref — too ambiguous.
    if (extIdInRef.length > 1) return null;
    if (extIdInRef.length === 1) {
      const [rPid, rVal] = extIdInRef[0];
      const hit = onlyMatch(
        (e) =>
          e.extIdPid === rPid && e.claim.mainsnak?.datavalue?.value === rVal,
      );
      if (hit) return hit;
    }
    // Strategies 2 & 3 need a URL from the ref.
    const urlSnak =
      (refSnaks[PID_REFERENCE_URL] || [])[0] ?? (refSnaks[PID_URL] || [])[0];
    const urlValue = urlSnak?.datavalue?.value;
    if (typeof urlValue !== "string") return null;
    // Strategy 2: pattern-match the URL to extract an ext-id value.
    const matchResult = matchUrlAgainstPatternsWithCleanup(urlValue);
    if (matchResult.matched && matchResult.extractedId) {
      const hit = onlyMatch(
        (e) =>
          e.extIdPid === matchResult.suggestedProperty &&
          e.claim.mainsnak?.datavalue?.value === matchResult.extractedId,
      );
      if (hit) return hit;
    }
    // Strategy 3: the candidate's ext-id value appears verbatim in the URL.
    return onlyMatch((e) => {
      const val = e.claim.mainsnak?.datavalue?.value;
      return typeof val === "string" && val.length > 0 && urlValue.includes(val);
    });
  };

  for (const p1343Claim of describedByClaims) {
    const sourceQid = p1343Claim.mainsnak?.datavalue?.value?.id;
    if (!isQid(sourceQid)) continue;
    // Prefer the allowed match; fall back to not-allowed only when the QID is
    // not an allowed value for any property present on this item.
    const candidateExtIds =
      extIdByAllowed.get(sourceQid) ?? extIdByNotAllowed.get(sourceQid);
    if (!candidateExtIds?.length) continue;
    const hasQuals = !hasNoQualifiers(p1343Claim);
    const refs = p1343Claim.references || [];
    // Safety: more than one reference on the P1343 claim -> too ambiguous.
    if (refs.length > 1) continue;
    const targetEntry = resolveTarget(candidateExtIds, refs[0]);
    if (!targetEntry) continue; // could not unambiguously identify the target
    const { extIdPid, claim: extIdClaim } = targetEntry;
    if (!hasQuals && refs.length === 0) {
      // Simple variant: just remove the P1343 claim, nothing to migrate.
      diffs.push({
        action: ACTION_REMOVE_CLAIM,
        pid: PID_DESCRIBED_BY_SOURCE,
        claimId: p1343Claim.id,
        value: p1343Claim.mainsnak?.datavalue,
        keepClaimId: extIdClaim.id,
        keepValue: extIdClaim.mainsnak?.datavalue,
      });
    } else {
      // Complex variant: remove P1343 and hand its references and qualifiers
      // over to the ext-id claim.
      diffs.push({
        action: ACTION_ABSORB_CLAIM,
        pid: PID_DESCRIBED_BY_SOURCE,
        claimId: p1343Claim.id,
        value: p1343Claim.mainsnak?.datavalue,
        extIdPid,
        extIdClaimId: extIdClaim.id,
        keepValue: extIdClaim.mainsnak?.datavalue,
        // Snapshot qualifiers at detect-time so the apply step can embed them
        qualifiers: JSON.parse(JSON.stringify(p1343Claim.qualifiers || {})),
        qualifiersOrder: (p1343Claim["qualifiers-order"] || []).slice(),
        refHashes: refs.map((r) => r.hash),
      });
    }
  }
  return diffs;
}
/**
 * Detect URL-datatype claims (e.g. P973 described at URL, P856 official
 * website, P2699 URL) whose value matchUrlAgainstPatternsWithCleanup resolves
 * to an external-ID property plus an extracted identifier.
 *
 * Every non-deprecated claim whose mainsnak datatype is "url" and whose value
 * is a non-empty string is considered. For each such claim the first
 * applicable rule decides the outcome:
 *   1. Platform pair: the matched property has an entry in
 *      ABSORB_URL_CLAIM_PLATFORM_PAIRS, the item's P31 matches one of that
 *      entry's itemTypes, and a non-deprecated claim of its creatorPid exists.
 *      Emits ACTION_REMOVE_CLAIM citing the first such creator claim; the
 *      diff carries no qualifier/reference snapshot.
 *   2. Skip list: the matched property is listed in ABSORB_URL_CLAIM_SKIP and
 *      the item's P31 matches one of the listed QIDs. Emits nothing.
 *   3. No non-deprecated ext-id claim holds the extracted value: emits
 *      nothing, or ACTION_CONVERT_URL_TO_EXT_ID when
 *      DEV_ABSORB_URL_CLAIM_CREATE_MISSING is set.
 *   4. A matching ext-id claim exists: emits ACTION_REMOVE_CLAIM when the URL
 *      claim has neither qualifiers nor references, otherwise
 *      ACTION_ABSORB_CLAIM with a detect-time snapshot of the qualifiers and
 *      the reference hashes.
 *
 * Notes:
 *   - The number of references on the URL claim is not limited.
 *   - When several non-deprecated ext-id claims share a value, the first one
 *     encountered is used as the target.
 *   - If the item has no non-deprecated string-valued ext-id claim at all and
 *     the DEV flag is off, the function returns early with no diffs.
 *   - Nothing is awaited here; the function stays async so callers keep
 *     receiving a Promise.
 *
 * @param {object} entity - Entity JSON; only `claims` is read.
 * @returns {Promise<object[]>} Diffs in URL-claim iteration order.
 */
async function detectAbsorbUrlClaim(entity) {
  const diffs = [];
  const claimsByPid = Object.entries(entity.claims || {});

  // Collect every non-deprecated URL-datatype claim with a usable value.
  const urlClaims = [];
  for (const [pid, claims] of claimsByPid) {
    for (const claim of claims) {
      if (claim.rank === "deprecated") continue;
      if (claim.mainsnak?.datatype !== "url") continue;
      const urlValue = claim.mainsnak?.datavalue?.value;
      if (typeof urlValue !== "string" || !urlValue) continue;
      urlClaims.push({ pid, claim, urlValue });
    }
  }
  if (!urlClaims.length) return diffs;

  // extIdPid -> Map<value, claim> over all non-deprecated ext-id claims.
  const extIdIndex = new Map();
  for (const [pid, claims] of claimsByPid) {
    for (const claim of claims) {
      if (claim.rank === "deprecated") continue;
      if (claim.mainsnak?.datatype !== "external-id") continue;
      const val = claim.mainsnak?.datavalue?.value;
      if (typeof val !== "string") continue;
      let byValue = extIdIndex.get(pid);
      if (!byValue) {
        byValue = new Map();
        extIdIndex.set(pid, byValue);
      }
      // Keep the first claim per value (duplicates are handled elsewhere).
      if (!byValue.has(val)) byValue.set(val, claim);
    }
  }
  if (!extIdIndex.size && !DEV_ABSORB_URL_CLAIM_CREATE_MISSING) return diffs;

  // The item's P31 values do not change while iterating, so gather them once
  // instead of rescanning the P31 claims for every URL claim.
  const instanceOfIds = new Set(
    (entity.claims?.["P31"] || []).map(
      (c) => c.mainsnak?.datavalue?.value?.id,
    ),
  );
  const itemIsA = (qids) => qids.some((qid) => instanceOfIds.has(qid));

  // Detect-time copy of a claim's qualifiers so the apply step can embed them.
  const snapshotQualifiers = (claim) => ({
    qualifiers: JSON.parse(JSON.stringify(claim.qualifiers || {})),
    qualifiersOrder: (claim["qualifiers-order"] || []).slice(),
  });

  for (const { pid, claim: urlClaim, urlValue } of urlClaims) {
    const refs = urlClaim.references || [];

    const matchResult = matchUrlAgainstPatternsWithCleanup(urlValue);
    if (!matchResult.matched || !matchResult.extractedId) continue;
    const { suggestedProperty: extIdPid, extractedId } = matchResult;

    // Rule 1: the URL points at a content item of a platform for which the
    // subject already has a creator identifier -> the URL claim is removed
    // regardless of its qualifiers/references.
    const platformPair = ABSORB_URL_CLAIM_PLATFORM_PAIRS[extIdPid];
    if (platformPair && itemIsA(platformPair.itemTypes)) {
      const creatorClaim = (
        entity.claims?.[platformPair.creatorPid] || []
      ).find((c) => c.rank !== "deprecated");
      if (creatorClaim) {
        diffs.push({
          action: ACTION_REMOVE_CLAIM,
          pid,
          claimId: urlClaim.id,
          value: urlClaim.mainsnak?.datavalue,
          keepClaimId: creatorClaim.id,
          keepValue: creatorClaim.mainsnak?.datavalue,
        });
        continue;
      }
    }

    // Rule 2: ext-id property is inappropriate for this item type.
    const skipTypes = ABSORB_URL_CLAIM_SKIP[extIdPid];
    if (skipTypes && itemIsA(skipTypes)) continue;

    const extIdClaim = extIdIndex.get(extIdPid)?.get(extractedId);

    // Rule 3: no ext-id claim with that value yet.
    if (!extIdClaim) {
      if (!DEV_ABSORB_URL_CLAIM_CREATE_MISSING) continue;
      diffs.push({
        action: ACTION_CONVERT_URL_TO_EXT_ID,
        pid,
        claimId: urlClaim.id,
        value: urlClaim.mainsnak?.datavalue,
        extIdPid,
        extractedId,
        ...snapshotQualifiers(urlClaim),
        refHashes: refs.map((r) => r.hash),
      });
      continue;
    }

    // Never absorb a claim into itself (should not happen; guard anyway).
    if (extIdClaim.id === urlClaim.id) continue;

    // Rule 4: plain removal when there is nothing to migrate, else absorb.
    const isSimple = hasNoQualifiers(urlClaim) && refs.length === 0;
    if (isSimple) {
      diffs.push({
        action: ACTION_REMOVE_CLAIM,
        pid,
        claimId: urlClaim.id,
        value: urlClaim.mainsnak?.datavalue,
        keepClaimId: extIdClaim.id,
        keepValue: extIdClaim.mainsnak?.datavalue,
      });
    } else {
      diffs.push({
        action: ACTION_ABSORB_CLAIM,
        pid,
        claimId: urlClaim.id,
        value: urlClaim.mainsnak?.datavalue,
        extIdPid,
        extIdClaimId: extIdClaim.id,
        keepValue: extIdClaim.mainsnak?.datavalue,
        ...snapshotQualifiers(urlClaim),
        refHashes: refs.map((r) => r.hash),
      });
    }
  }
  return diffs;
}
/**
 * Detect "cites work" (PID_CITES_WORK) claims whose value is the item that is
 * currently being viewed, i.e. an item citing itself.
 *
 * The current item id is read from mw.config "wgTitle"; when that is not a
 * QID nothing is reported.
 *
 * @param {object} entity - Entity JSON; only `claims` is read.
 * @returns {object[]} One makeRemoveClaimDiff result per self-citing claim.
 */
function detectSelfCite(entity) {
  const ownQid = mw.config.get("wgTitle");
  if (!isQid(ownQid)) {
    return [];
  }
  const diffs = [];
  for (const citation of entity.claims?.[PID_CITES_WORK] || []) {
    const citedId = citation.mainsnak?.datavalue?.value?.id;
    if (citedId !== ownQid) continue;
    diffs.push(makeRemoveClaimDiff(PID_CITES_WORK, citation, null));
  }
  return diffs;
}
// ==== 18 Whitelist for always-removable wikimedia refs ====================
/**
 * Statement properties that link to Wikimedia categories or Commons pages.
 *
 * detectRefCategories consults this set by statement PID: a reference
 * classified as "wikimedia" on a statement of one of these properties is
 * treated as always removable, bypassing the strict-statement and
 * reference-level checks that apply to "wikimedia" references elsewhere.
 */
const ALWAYS_REMOVE_WIKIMEDIA_PIDS = new Set([
// Category properties
"P301", // category's main topic
"P373", // Commons category
"P910", // topic's main category
"P971", // category combines topics
"P1200", // category for the water basin
"P1464", // category for people born here
"P1465", // category for people who died here
"P1740", // category for films shot at this location
"P1753", // list related to category
"P1754", // category related to list
"P1791", // category for people buried here
"P1792", // category of associated people
"P2033", // category for pictures taken with this camera
"P2517", // category for recipients of this award
"P2875", // property usage tracking category
"P3709", // category for value different from Wikidata
"P3713", // category for value not in Wikidata
"P3734", // category for value same as Wikidata
"P3876", // category for alumni of educational institution
"P4195", // category for employees of the organization
"P4224", // category contains
"P4329", // category populated by
"P5996", // category for films in this language
"P6112", // category for members of a team
"P6186", // category for eponymous categories
"P6365", // member category
"P7084", // related category
"P7561", // category for the interior of the item
"P7782", // category for ship name
"P7861", // category for files created with program
"P7867", // category for maps or plans
"P8464", // content partnership category
"P10280", // category for honorary citizens of entity
"P12686", // category for births in this time period
"P12687", // category for deaths in this time period
// Commons page properties
"P935", // Commons gallery
"P1472", // Commons Creator page
"P1612", // Commons Institution page
]);
// ==== 19 Reference-detector factory =======================================
/**
 * Shared single pass over all references for the reference-category
 * detectors (those without a detect function of their own).
 *
 * Each reference is categorised once via determineSourceCategory; when its
 * category is one of `activeCategories` and the removal rules below allow
 * it, an ACTION_REMOVE_REFS diff is appended to that category's bucket.
 *
 * Removal rules per category:
 *
 *                            Level    Applies to   Applies to
 *                            check    strict       non-strict/ext-id
 *   wikimedia:               Y*       Y            Y     * not for ALWAYS_REMOVE_WIKIMEDIA_PIDS
 *   aggregator:              Y        Y            N
 *   community:               Y        Y            N
 *   redundant:               N        Y            Y     (another, non-redundant reference exists)
 *   inferred:                Y        Y            N
 *   obsolete:                Y        Y            N
 *   invalid:                 N        -            Y
 *   self_stated_in:          N        -            Y
 *   wikimedia_no_sitelinks:  N        Y            Y
 *
 * "Level check" means the reference is only removed when its level (from
 * getReferenceLevel) is below the highest level among the claim's
 * references. References reported as splittable by isSplittableReference are
 * never removed here.
 *
 * @param {object} entity - Entity JSON; only `claims` is read.
 * @param {string[]} activeCategories - Category keys to collect.
 * @returns {Map<string, object[]>} categoryKey -> diffs (empty array when
 *   nothing was found for that key).
 */
function detectRefCategories(entity, activeCategories) {
  const buckets = new Map();
  for (const key of activeCategories) {
    buckets.set(key, []);
  }

  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      const refs = claim.references;
      if (!refs?.length) continue;

      const levels = refs.map((ref) =>
        getReferenceLevel(entity, ref, refs, claim),
      );
      const highestLevel = Math.max(...levels);
      const strict = isStrictStatement(claim);

      for (const [idx, ref] of refs.entries()) {
        const cat = determineSourceCategory(entity, ref, refs, claim);
        const bucket = buckets.get(cat);
        if (!bucket) continue;

        if (isSplittableReference(ref).splittable) continue;

        // Does this category allow removal without level/strictness checks?
        let unconditional;
        switch (cat) {
          case "wikimedia":
            unconditional = ALWAYS_REMOVE_WIKIMEDIA_PIDS.has(pid);
            break;
          case "invalid":
          case "self_stated_in":
            unconditional = !strict;
            break;
          case "wikimedia_no_sitelinks":
          case "redundant":
            unconditional = true;
            break;
          default:
            unconditional = false;
        }

        if (!unconditional) {
          // Only "wikimedia" references may be removed from non-strict claims.
          const needsStrictClaim = cat !== "wikimedia";
          if (needsStrictClaim && !strict) continue;
          // Keep the best-level reference(s) of the claim.
          if (levels[idx] >= highestLevel) continue;
        }

        const diff = {
          action: ACTION_REMOVE_REFS,
          pid,
          claimId: claim.id,
          refHash: ref.hash,
          removedKeys: Object.keys(ref.snaks),
        };
        if (cat === "wikimedia_no_sitelinks") {
          diff.lang =
            analyzeWikimediaReference(entity, ref).language || "unknown";
        }
        bucket.push(diff);
      }
    }
  }
  return buckets;
}
/**
 * Build the registry entry for one reference-category detector.
 *
 * The entry has no detect function of its own (`detect: null`); the comment
 * in the original code states that generatePreviewDiffs runs
 * detectRefCategories once for all such detectors instead.
 *
 * The "wikimedia_no_sitelinks" category gets an additional "lang" column.
 *
 * @param {string} categoryKey - Source category; used as label and summaryLabel.
 * @returns {object} Detector descriptor with headers and a renderRow function.
 */
function createReferenceDetector(categoryKey) {
  const showLangColumn = categoryKey === "wikimedia_no_sitelinks";

  const headers = ["property"];
  if (showLangColumn) headers.push("lang");
  headers.push("removedValues");

  const renderRow = (row, labels) => {
    const propertyCell = renderLink({
      id: row.pid,
      claimId: row.claimId,
      labels,
    });
    const removedCell = resolvePidList(row.removedKeys, labels);
    if (!showLangColumn) {
      return [propertyCell, removedCell];
    }
    const langCell = renderLangCode(row.lang, { hintOnly: true });
    return [propertyCell, langCell, removedCell];
  };

  return {
    label: categoryKey,
    headers,
    isRemoveRefCategory: true,
    summaryLabel: categoryKey,
    // Diffs for this entry come from the shared detectRefCategories pass.
    detect: null,
    renderRow,
  };
}
/**
 * Detect references whose P854 (reference URL) repeats a URL the statement
 * already carries, either as its mainsnak value (URL-datatype statement) or
 * as a URL-datatype qualifier.
 *
 * URLs are compared after normalisation: normalizeUrl + cleanUrl in
 * recognition mode, trailing slash removed, and a leading "www." dropped
 * from the hostname. If normalisation throws, the raw string is compared.
 *
 * Per matching reference one of two things is emitted:
 *   - ACTION_REMOVE_REFS (whole reference) when, apart from the redundant
 *     P854 snaks, only P813 / P1476 / P1810 snaks remain (or nothing at all).
 *   - ACTION_REMOVE_REDUNDANT_REF_URL, one diff per redundant P854 snak, when
 *     other content remains in the reference.
 *
 * Deprecated claims are ignored, and so are references that
 * isSplittableReference reports as splittable.
 *
 * @param {object} entity - Entity JSON; only `claims` is read.
 * @returns {object[]} Diffs in claim/reference iteration order.
 */
function detectRemoveRedundantRefUrl(entity) {
  // Snak properties that do not count as real content of a reference.
  const metadataOnlyPids = new Set([
    PID_RETRIEVED,
    PID_TITLE,
    PID_SUBJECT_NAMED_AS,
  ]);

  // Comparison key for a URL; null when the input is not a non-empty string.
  const toComparable = (raw) => {
    if (typeof raw !== "string" || !raw) return null;
    try {
      const cleaned = cleanUrl(normalizeUrl(raw), {
        recognitionMode: true,
      }).url;
      const parsed = new URL(removeTrailingSlash(cleaned));
      // "www." is dropped for comparison only.
      if (parsed.hostname.startsWith("www.")) {
        parsed.hostname = parsed.hostname.slice(4);
      }
      return removeTrailingSlash(parsed.href);
    } catch {
      // Not parseable: fall back to comparing the raw string.
      return raw;
    }
  };

  const diffs = [];
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      if (claim.rank === "deprecated") continue;

      // URLs already present on the statement (mainsnak + qualifiers).
      const knownUrls = new Set();
      const remember = (value) => {
        const key = toComparable(value);
        if (key) knownUrls.add(key);
      };
      if (claim.mainsnak?.datatype === "url") {
        remember(claim.mainsnak?.datavalue?.value);
      }
      for (const qualifierSnaks of Object.values(claim.qualifiers || {})) {
        for (const snak of qualifierSnaks) {
          if (snak.datatype === "url") remember(snak.datavalue?.value);
        }
      }
      if (knownUrls.size === 0) continue;

      for (const ref of claim.references || []) {
        if (isSplittableReference(ref).splittable) continue;

        const refUrlSnaks = ref.snaks?.[PID_REFERENCE_URL] || [];
        const redundant = refUrlSnaks.filter((snak) => {
          const key = toComparable(snak.datavalue?.value);
          return key !== null && knownUrls.has(key);
        });
        if (redundant.length === 0) continue;

        // Would anything meaningful be left once the redundant snaks are gone?
        const redundantSet = new Set(redundant);
        const keepsSomeRefUrl = refUrlSnaks.some((s) => !redundantSet.has(s));
        const hasRealContent = Object.keys(ref.snaks || {}).some(
          (p) =>
            (p !== PID_REFERENCE_URL || keepsSomeRefUrl) &&
            !metadataOnlyPids.has(p),
        );

        if (hasRealContent) {
          for (const snak of redundant) {
            diffs.push({
              action: ACTION_REMOVE_REDUNDANT_REF_URL,
              pid,
              claimId: claim.id,
              refHash: ref.hash,
              snakHash: snak.hash,
              referenceUrl: snak.datavalue?.value,
            });
          }
          continue;
        }

        diffs.push({
          action: ACTION_REMOVE_REFS,
          pid,
          claimId: claim.id,
          refHash: ref.hash,
          removedKeys: Object.keys(ref.snaks),
          // Extra info for renderRow.
          _redundantRefUrl: redundant[0]?.datavalue?.value,
        });
      }
    }
  }
  return diffs;
}
/**
 * Detect URL-datatype claims whose (normalised) value appears on two or more
 * different URL properties of the same item, e.g. P856 official website and
 * P973 described at URL pointing at the same address.
 *
 * The property with the lowest numeric PID is the one to keep, and its first
 * collected claim for that URL is the target. Every claim for the same URL on
 * a higher-numbered property is reported:
 *   - ACTION_REMOVE_CLAIM when it has neither qualifiers nor references;
 *   - ACTION_ABSORB_CLAIM otherwise, carrying a detect-time snapshot of its
 *     qualifiers and its reference hashes (the target is passed in the
 *     extIdPid / extIdClaimId fields).
 *
 * Only non-deprecated claims are considered. Duplicate claims on the kept
 * property itself are not reported here. Normalisation (normalizeUrl +
 * cleanUrl in recognition mode + removeTrailingSlash) is used for grouping
 * only; the stored claim values are left untouched.
 *
 * @param {object} entity - Entity JSON; only `claims` is read.
 * @returns {object[]} Diffs, grouped by URL in first-seen order.
 */
function detectDuplicateUrlClaims(entity) {
  const pidNumber = (pid) => parseInt(pid.replace("P", ""), 10);

  // normalisedUrl -> Map<pid, claim[]>
  const claimsByUrl = new Map();
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      if (claim.rank === "deprecated") continue;
      if (claim.mainsnak?.datatype !== "url") continue;
      const rawUrl = claim.mainsnak?.datavalue?.value;
      if (typeof rawUrl !== "string" || !rawUrl) continue;

      const cleaned = cleanUrl(normalizeUrl(rawUrl), {
        recognitionMode: true,
      }).url;
      const urlKey = removeTrailingSlash(cleaned);

      let perPid = claimsByUrl.get(urlKey);
      if (!perPid) {
        perPid = new Map();
        claimsByUrl.set(urlKey, perPid);
      }
      const bucket = perPid.get(pid);
      if (bucket) {
        bucket.push(claim);
      } else {
        perPid.set(pid, [claim]);
      }
    }
  }

  const diffs = [];
  for (const perPid of claimsByUrl.values()) {
    // A URL used by a single property is not a cross-property duplicate.
    if (perPid.size < 2) continue;

    const [keepPid, ...removePids] = [...perPid.keys()].sort(
      (a, b) => pidNumber(a) - pidNumber(b),
    );
    const keepClaim = perPid.get(keepPid)[0];

    for (const removePid of removePids) {
      for (const removeClaim of perPid.get(removePid)) {
        const refs = removeClaim.references || [];
        const nothingToMigrate =
          hasNoQualifiers(removeClaim) && refs.length === 0;

        if (nothingToMigrate) {
          diffs.push({
            action: ACTION_REMOVE_CLAIM,
            pid: removePid,
            claimId: removeClaim.id,
            value: removeClaim.mainsnak?.datavalue,
            keepClaimId: keepClaim.id,
            keepValue: keepClaim.mainsnak?.datavalue,
          });
          continue;
        }

        diffs.push({
          action: ACTION_ABSORB_CLAIM,
          pid: removePid,
          claimId: removeClaim.id,
          value: removeClaim.mainsnak?.datavalue,
          extIdPid: keepPid,
          extIdClaimId: keepClaim.id,
          keepValue: keepClaim.mainsnak?.datavalue,
          qualifiers: JSON.parse(JSON.stringify(removeClaim.qualifiers || {})),
          qualifiersOrder: (removeClaim["qualifiers-order"] || []).slice(),
          refHashes: refs.map((r) => r.hash),
        });
      }
    }
  }
  return diffs;
}
/**
 * Reduce a raw URL to the form used for blocklist matching.
 *
 * A leading "http://" or "https://" is removed first, then a leading "www."
 * (both case-insensitively), so a prefix rule such as "about.me" matches
 * "https://about.me/..." as well as "https://www.about.me/...".
 *
 * @param {*} rawUrl - URL as stored on the claim.
 * @returns {string} The stripped URL, or "" when `rawUrl` is not a string.
 */
function normalizeUrlForBlocklist(rawUrl) {
  if (typeof rawUrl !== "string") {
    return "";
  }
  const withoutScheme = rawUrl.replace(/^https?:\/\//i, "");
  const withoutWww = withoutScheme.replace(/^www\./i, "");
  return withoutWww;
}
/**
 * Test a single (normalised) URL against one blocklist rule.
 *
 * Supported rule shapes:
 *   - matchType "prefix": matches when the URL starts with `rule.pattern`.
 *   - matchType "regex":  matches when `rule.compiledRegex` is set and tests
 *     true against the URL.
 * Any other rule never matches.
 *
 * @param {string} normUrl - URL as produced by normalizeUrlForBlocklist.
 * @param {object} rule - Blocklist rule (`matchType`, `pattern`, `compiledRegex`).
 * @returns {boolean} True when the URL matches the rule.
 */
function urlMatchesBlocklistRule(normUrl, rule) {
  switch (rule.matchType) {
    case "prefix":
      return normUrl.startsWith(rule.pattern);
    case "regex": {
      const regex = rule.compiledRegex;
      if (!regex) return false;
      // The compiled regex is reused for every URL. For /g or /y regexes
      // RegExp.prototype.test resumes at lastIndex, so without this reset a
      // match on one URL could make the next test start mid-string and miss.
      regex.lastIndex = 0;
      return regex.test(normUrl);
    }
    default:
      return false;
  }
}
/**
 * Detect URL-datatype claims (mainsnak) whose value matches a rule of the
 * URL deprecation blocklist ([[User:Difool/URL-deprecation-blocklist]]),
 * read from urlBlocklistCache.rules.
 *
 * The first matching rule decides the outcome for a claim:
 *   rule.action "remove"  -> ACTION_REMOVE_CLAIM
 *   any other action      -> ACTION_DEPRECATE_URL_CLAIM, carrying the rule's
 *                            deprecationReason (null when the rule has none)
 *
 * Claims that already have deprecated rank are skipped. Each diff carries the
 * rule's `sectionLabel` so the preview can show which blocklist section
 * matched (e.g. "Self-created profile platforms").
 *
 * @param {object} entity - Entity JSON; only `claims` is read.
 * @returns {object[]} At most one diff per URL claim.
 */
function detectBlocklistedUrlClaims(entity) {
  const { rules } = urlBlocklistCache;
  if (!rules.length) return [];

  const diffs = [];
  for (const [pid, claims] of Object.entries(entity.claims || {})) {
    for (const claim of claims) {
      if (claim.rank === "deprecated") continue;
      if (claim.mainsnak?.datatype !== "url") continue;
      const rawUrl = claim.mainsnak?.datavalue?.value;
      if (typeof rawUrl !== "string" || !rawUrl) continue;

      const normUrl = normalizeUrlForBlocklist(rawUrl);
      // First matching rule wins.
      const hit = rules.find((rule) => urlMatchesBlocklistRule(normUrl, rule));
      if (!hit) continue;

      const diff = {
        action:
          hit.action === "remove"
            ? ACTION_REMOVE_CLAIM
            : ACTION_DEPRECATE_URL_CLAIM,
        pid,
        claimId: claim.id,
        value: claim.mainsnak?.datavalue,
        sectionLabel: hit.sectionLabel,
      };
      if (hit.action !== "remove") {
        diff.deprecationReason = hit.deprecationReason || null;
      }
      diffs.push(diff);
    }
  }
  return diffs;
}
/**
 * Propose a "mul" label for human items whose en, de and fr labels agree.
 *
 * All of the following must hold, otherwise nothing is emitted:
 *   - The item has a P31 (instance of) claim with value Q5 (human).
 *   - labels.mul is absent or empty.
 *   - The en, de and fr labels are all present and non-empty.
 *   - After normalizeText() the three labels are identical and non-empty.
 *   - The normalised label consists only of Latin-script letters, numbers,
 *     separators, punctuation and combining marks (no Cyrillic, Arabic,
 *     Hebrew, CJK, Devanagari, Greek, ...).
 *
 * @param {object} entity - Entity JSON; `claims` and `labels` are read.
 * @returns {object[]} Empty, or a single ACTION_SET_MUL_LABEL diff whose
 *   value is the normalised label.
 */
function detectAddMulLabel(entity) {
  const instanceOfClaims = entity.claims?.P31 || [];
  const isHuman = instanceOfClaims.some(
    (c) => c.mainsnak?.datavalue?.value?.id === "Q5",
  );
  if (!isHuman) return [];

  // An existing mul label is left alone.
  if (entity.labels?.mul?.value) return [];

  const required = ["en", "de", "fr"];
  const rawLabels = [];
  for (const lang of required) {
    const raw = entity.labels?.[lang]?.value;
    if (!raw) return [];
    rawLabels.push(raw);
  }

  const [candidate, ...others] = rawLabels.map((v) => normalizeText(v));
  if (!others.every((v) => v === candidate)) return [];
  if (!candidate) return [];

  // Latin script only (plus numbers, separators, punctuation, marks).
  const latinOnly = /^[\p{Script=Latin}\p{N}\p{Z}\p{P}\p{M}]+$/u;
  if (!latinOnly.test(candidate)) return [];

  return [
    {
      action: ACTION_SET_MUL_LABEL,
      value: candidate,
      matchingLangs: required.join(", "),
    },
  ];
}
/**
 * Detect alias values that occur (after normalizeText) in more than five
 * languages and are not yet covered by "mul".
 *
 * For each such value the result contains:
 *   1. one visible ACTION_ADD_MUL_ALIAS diff adding the first-seen spelling
 *      to aliases.mul, followed by
 *   2. one hidden (`_hidden: true`) ACTION_REMOVE_ALIAS diff per source
 *      language, using the spelling stored for that language.
 *
 * All diffs of one value share the same rowId, which lets the preview treat
 * the addition and its removals as one row.
 *
 * Skipped values:
 *   - aliases of the "mul" language itself (never a source language);
 *   - values already present in aliases.mul (normalised comparison);
 *   - values equal to the mul label (normalised comparison);
 *   - values found in five or fewer languages.
 * The item type is not checked.
 *
 * @param {object} entity - Entity JSON; `aliases` and `labels` are read.
 * @returns {object[]} Diffs grouped per value in first-seen order.
 */
function detectAddMulAlias(entity) {
  const MIN_LANGS_EXCLUSIVE = 5; // strictly more than this many languages

  const existingMulAliases = new Set();
  for (const alias of entity.aliases?.mul || []) {
    existingMulAliases.add(normalizeText(alias.value));
  }
  const mulLabel = entity.labels?.mul?.value;
  const mulLabelNorm = mulLabel ? normalizeText(mulLabel) : null;

  // normalisedValue -> { original: first-seen spelling, langs: Map<lang, spelling> }
  const candidates = new Map();
  for (const [lang, aliases] of Object.entries(entity.aliases || {})) {
    if (lang === "mul") continue;
    for (const alias of aliases) {
      const norm = normalizeText(alias.value);
      if (!norm) continue;
      if (existingMulAliases.has(norm)) continue;
      if (mulLabelNorm && norm === mulLabelNorm) continue;

      let entry = candidates.get(norm);
      if (!entry) {
        entry = { original: alias.value, langs: new Map() };
        candidates.set(norm, entry);
      }
      // Remember the exact stored string so the removal targets it.
      entry.langs.set(lang, alias.value);
    }
  }

  const diffs = [];
  candidates.forEach(({ original, langs }, norm) => {
    if (langs.size <= MIN_LANGS_EXCLUSIVE) return;
    const rowId = `addMulAlias_${norm}`;

    diffs.push({
      action: ACTION_ADD_MUL_ALIAS,
      rowId,
      value: original,
      sourceLangs: [...langs.keys()],
      langCount: langs.size,
    });

    langs.forEach((storedValue, lang) => {
      diffs.push({
        action: ACTION_REMOVE_ALIAS,
        _hidden: true,
        rowId,
        lang,
        value: storedValue,
      });
    });
  });
  return diffs;
}
/**
 * Detect labels in a language A (not 'en') that were copied from the English
 * label although the item's {A}wiki sitelink shows a different native title.
 *
 * Unless DEV_FIX_COPIED_LABELS_ALL_TYPES is set, only human items (P31 = Q5)
 * are processed.
 *
 * For every sitelink whose site id ends in "wiki" the language code is the
 * site id without that suffix, with underscores turned into hyphens (site ids
 * use "zh_min_nanwiki", label languages use "zh-min-nan"). A language is
 * processed when either
 *   (a) its label equals the English label (after normalizeText), or
 *   (b) it has no label, the mul label equals the English label, and the
 *       language already has a description.
 *
 * The proposed label is the sitelink title with a trailing parenthetical
 * disambiguation removed. No diff is emitted when that title is empty, or
 * when it equals the current label or the English label (compared after
 * normalizeText and with apostrophe variants unified).
 *
 * @param {object} entity - Entity JSON; `claims`, `labels`, `descriptions`
 *   and `sitelinks` are read.
 * @returns {object[]} ACTION_NORMALIZE diffs with field "label".
 */
function detectFixCopiedLabels(entity) {
  const diffs = [];
  // Only run on humans unless the DEV flag is set, because Wikipedia article
  // titles are always capitalised and may contain comma-based geographic
  // qualifications (e.g. "Springfield, Illinois") that are hard to normalise
  // safely for non-human items.
  if (!DEV_FIX_COPIED_LABELS_ALL_TYPES) {
    const isHuman = (entity.claims?.P31 || []).some(
      (c) => c.mainsnak?.datavalue?.value?.id === "Q5",
    );
    if (!isHuman) return diffs;
  }
  const enLabel = entity.labels?.en?.value;
  if (!enLabel) return diffs;
  const enNorm = normalizeText(enLabel);
  if (!enNorm) return diffs;

  // Unify apostrophe variants for comparison only, so that a pure apostrophe
  // style difference (e.g. "Mika\u2019ela" with U+2019 vs "Mika'ela" with
  // U+0027) does not produce a diff. Defined once, outside the loop.
  const normalizeApostrophes = (s) =>
    s.replace(/[\u2018\u2019\u02BC\u0060\u00B4\u02B9]/g, "'");
  const enComparable = normalizeApostrophes(enNorm);

  // A mul label identical to the English label signals the "copied English
  // everywhere" pattern, which makes absent per-language labels eligible for
  // filling from the Wikipedia title.
  const mulNorm = normalizeText(entity.labels?.mul?.value || "");
  const mulMatchesEn = mulNorm === enNorm;

  for (const [siteId, sitelink] of Object.entries(entity.sitelinks || {})) {
    // Only process {lang}wiki sitelinks.
    if (!siteId.endsWith("wiki")) continue;
    // Site ids use underscores where language codes use hyphens.
    const langCode = siteId.slice(0, -4).replace(/_/g, "-");
    if (langCode === "en") continue;
    if (!sitelink?.title) continue;

    const current = entity.labels?.[langCode]?.value || "";
    const currentNorm = normalizeText(current);

    // Trigger (a): direct copy of the English label.
    // Trigger (b): no label yet, but mul = en and a description exists, i.e.
    // someone has already worked on this language's data.
    const isDirect = currentNorm === enNorm;
    const isEmpty = !currentNorm;
    const hasDescription = !!entity.descriptions?.[langCode]?.value;
    if (!isDirect && !(isEmpty && mulMatchesEn && hasDescription)) continue;

    // Strip trailing parenthetical disambiguation.
    const stripped = sitelink.title.replace(/\s*\([^()]*\)\s*$/, "").trim();
    if (!stripped) continue;

    const strippedNorm = normalizeApostrophes(normalizeText(stripped));
    // Nothing to do if the title already is the current label.
    if (strippedNorm === normalizeApostrophes(currentNorm)) continue;
    // Nothing to fix if the title itself equals the English label.
    if (strippedNorm === enComparable) continue;

    diffs.push({
      action: ACTION_NORMALIZE,
      field: "label",
      claimId: null,
      lang: langCode,
      before: current,
      after: stripped,
      sitelinkTitle: sitelink.title,
    });
  }
  return diffs;
}
// ==== 20 Detector registry ================================================
/**
 * Render the preview cells for a "remove claim" diff.
 *
 * Always produces the property cell and the removed-value cell. A third
 * "because of" cell is added when the row names a claim and/or value that
 * is kept: a link to the kept claim when `keepClaimId` is set, otherwise the
 * rendered `keepValue`.
 *
 * @param {object} row - Diff row (pid, claimId, value, keepClaimId, keepValue).
 * @param {object} labels - Label lookup passed through to the renderers.
 * @returns {Array} Two or three rendered cells.
 */
function renderRemoveRow(row, labels) {
  const { pid, claimId, value, keepClaimId, keepValue } = row;

  const cells = [
    renderLink({ id: pid, claimId, labels }),
    renderLink({ claimId, datavalue: value, labels }),
  ];

  if (keepClaimId) {
    cells.push(
      renderLink({ claimId: keepClaimId, datavalue: keepValue, labels }),
    );
  } else if (keepValue) {
    cells.push(renderValue(keepValue, labels));
  }
  return cells;
}
/**
 * Render the preview cells for a merge diff (one claim merged into another).
 *
 * Both claims are looked up in `entity` by id; the row shows the property,
 * the value of the claim being merged and the value of the claim it is
 * merged into. The two value links are rendered without a label lookup.
 *
 * @param {object} row - Diff row (pid, fromClaimId, toClaimId).
 * @param {object} labels - Label lookup, used for the property cell only.
 * @param {object} entity - Entity JSON the claims are looked up in.
 * @returns {Array} Three rendered cells: property, source value, target value.
 */
function renderDuplicateValueRow(row, labels, entity) {
  const { pid, fromClaimId, toClaimId } = row;

  const propertyCell = renderLink({ id: pid, claimId: fromClaimId, labels });

  const sourceClaim = findClaimById(entity, fromClaimId);
  const targetClaim = findClaimById(entity, toClaimId);

  const sourceCell = renderLink({
    claimId: fromClaimId,
    datavalue: sourceClaim?.mainsnak?.datavalue,
  });
  const targetCell = renderLink({
    claimId: toClaimId,
    datavalue: targetClaim?.mainsnak?.datavalue,
  });

  return [propertyCell, sourceCell, targetCell];
}
const detectors = {
...(DEV_WIKIMEDIA_NO_SITELINKS
? {
wikimedia_no_sitelinks: createReferenceDetector(
"wikimedia_no_sitelinks",
),
}
: {}),
wikimedia: createReferenceDetector("wikimedia"),
aggregator: createReferenceDetector("aggregator"),
community: createReferenceDetector("community"),
redundant: createReferenceDetector("redundant"),
inferred: createReferenceDetector("inferred"),
obsolete: createReferenceDetector("obsolete"),
invalid: createReferenceDetector("invalid"),
self_stated_in: createReferenceDetector("self_stated_in"),
removeObsoleteSnaks: {
label: "partial_obsolete",
headers: ["property", "removedValues"],
isRemoveRefCategory: false,
summaryLabel: "remove obsolete snaks from references",
detect: detectObsoleteSnaksInReferences,
renderRow(row, labels) {
return [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
resolvePidList(row.removedKeys, labels),
];
},
},
normalizeLabels: {
label: "normalizeLabels",
headers: ["field", "lang", "original", "normalized"],
isRemoveRefCategory: false,
summaryLabel: "normalize text",
detect: detectNormalizeLabels,
renderRow: (row) => [
row.field,
renderLangCode(row.lang, { hintOnly: true }),
visualizeInvisibleChars(row.before),
visualizeInvisibleChars(row.after),
],
},
removeAliasEqualsLabel: {
label: "removeAliasEqualsLabel",
headers: ["lang", "aliasLabel", "context"], // add a "context" column
summaryLabel: "remove alias=label/mul",
isRemoveRefCategory: false,
detect: detectRemoveDuplicateAliases,
renderRow: (row) => [
renderLangCode(row.lang),
visualizeInvisibleChars(row.value),
row.reason === "alias_equals_mul_label"
? "= mul label"
: row.reason === "alias_equals_mul_alias"
? "= mul alias"
: row.reason === "duplicate"
? "duplicate"
: "= label", // alias_equals_label
],
},
removeLowPrecisionDates: {
label: "removeLowPrecisionDates",
headers: ["property", "removed", "becauseOf"],
isRemoveRefCategory: false,
summaryLabel: "remove redundant dates",
detect: detectLowPrecisionDates,
renderRow: renderRemoveRow,
},
removeRedundantPreferred: {
label: "removeRedundantPreferred",
headers: ["property", "p7452"],
isRemoveRefCategory: false,
summaryLabel: "downgrade preferred ranks",
detect: detectRedundantPreferred,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
row.removedQualifier ? getMsg("yes") : getMsg("no"),
],
},
removeExpiredPreferred: {
label: "removeExpiredPreferred",
headers: ["property", "p7452"],
isRemoveRefCategory: false,
summaryLabel: "downgrade expired preferred ranks",
detect: detectExpiredPreferred,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
row.removedQualifier ? getMsg("yes") : getMsg("no"),
],
},
removeEmptyEndTime: {
label: "removeEmptyEndTime",
headers: ["property", "value"],
isRemoveRefCategory: false,
summaryLabel: "remove empty end time qualifiers",
detect: detectEmptyEndTime,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
row.qualifierValue ? JSON.stringify(row.qualifierValue) : "(no value)",
],
},
redundantCitizenshipDates: {
label: "redundantCitizenshipDates",
headers: ["property", "value", "becauseOf"],
isRemoveRefCategory: false,
summaryLabel: "remove redundant citizenship start/end dates",
detect: detectRedundantCitizenshipDates,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
renderValue(row.qualifierValue, labels),
renderEntity(row.matchedPid, labels),
],
},
mergeSameDateClaims: {
label: "mergeSameDateClaims",
headers: ["property", "value", "mergedInto"],
summaryLabel: "merge same-date claims",
isRemoveRefCategory: false,
detect: detectMergeSameDateClaims,
renderRow: renderDuplicateValueRow,
},
replaceWrongProperty: {
label: "replaceWrongProperty",
headers: ["property", "context", "oldProperty", "newProperty"],
summaryLabel: "replace property",
isRemoveRefCategory: false,
detect: detectWrongPropertyClaims,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
getMsg(row.context),
renderEntity(row.oldProperty, labels),
renderEntity(row.newProperty, labels),
],
},
moveRetrievedFromExternalId: {
label: "moveRetrievedFromExternalId",
headers: ["property", "value"],
isRemoveRefCategory: false,
summaryLabel: "move P813 from external-id to reference",
detect: detectMoveRetrievedFromExternalId,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
renderValue(row.qualifierValue),
],
},
duplicateValues: {
label: "duplicateValues",
headers: ["property", "value", "mergedInto"],
summaryLabel: "merge duplicate values",
isRemoveRefCategory: false,
detect: detectDuplicateValues,
renderRow: renderDuplicateValueRow,
},
dupRetrieved: {
label: "dupRetrieved",
headers: ["property", "removedValues"],
isRemoveRefCategory: true,
summaryLabel: "duplicate",
detect: detectDuplicateRefs,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
resolvePidList(row.removedKeys, labels),
],
},
removeRedundantOccupation: {
label: "removeRedundantOccupation",
headers: ["property", "removed", "becauseOf"],
isRemoveRefCategory: false,
summaryLabel: "remove redundant occupations",
detect: detectRedundantOccupation,
renderRow: renderRemoveRow,
},
removeJulianGregorianDates: {
label: "removeJulianGregorianDates",
headers: ["property", "removed", "becauseOf"],
isRemoveRefCategory: false,
summaryLabel: "remove Julian/Gregorian duplicate dates",
detect: detectJulianGregorianDuplicateDates,
renderRow: renderRemoveRow,
},
upgradePreciseDate: {
label: "upgradePreciseDate",
headers: ["property", "value", "becauseOf"],
isRemoveRefCategory: false,
summaryLabel: "upgrade precise date to preferred rank",
detect: detectUpgradePreciseDate,
renderRow: (row, labels) => {
// Only the ACTION_UPGRADE_PRECISE_DATE half is shown in the table;
// the companion ACTION_DOWNGRADE_PREFERRED diff shares the same rowId
// group and is applied automatically.
const propLink = renderLink({
id: row.pid,
claimId: row.claimId,
labels,
});
const preciseLink = renderLink({
claimId: row.claimId,
datavalue: row.value,
labels,
});
const lessLink = row.deprClaimId
? renderLink({
claimId: row.deprClaimId,
datavalue: row.deprValue,
labels,
})
: "";
return [propLink, preciseLink, lessLink];
},
},
convertWikipediaStatedIn: {
label: "convertWikipediaStatedIn",
headers: ["property", "context", "oldProperty", "newProperty"],
summaryLabel: "replace property",
isRemoveRefCategory: false,
detect: detectConvertWikipediaStatedIn,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
getMsg(row.context),
renderEntity(row.oldProperty, labels),
renderEntity(row.newProperty, labels),
],
},
convertInvalidStatedInReference: {
label: "convertInvalidStatedInReference",
headers: ["property", "externalIdProperty", "oldValue", "newValue"],
summaryLabel: "fix invalid 'stated in' in reference",
isRemoveRefCategory: false,
requiresLargeBuffers: true,
detect: detectInvalidStatedInReference,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
renderPidLink(
row.externalIdPid,
renderEntity(row.externalIdPid, labels),
),
renderQidLink(row.oldValue, renderEntity(row.oldValue, labels)),
renderQidLink(row.newValue, renderEntity(row.newValue, labels)),
],
},
mismatchedWikimediaImport: {
label: "mismatchedWikimediaImport",
headers: ["property", "referenceUrl", "oldValue", "newValue"],
summaryLabel: "fix mismatched P143 vs P4656",
isRemoveRefCategory: false,
detect: detectMismatchedWikimediaImport,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
formatUrlForDisplay(row.importUrl || ""),
renderQidLink(row.oldValue, renderEntity(row.oldValue, labels)),
renderQidLink(row.newValue, renderEntity(row.newValue, labels)),
],
},
removeIdDescriptions: {
label: "removeIdDescriptions",
headers: ["description", "idPresent"],
summaryLabel: "remove ID-style descriptions",
isRemoveRefCategory: false,
detect: detectIdDescriptions,
renderRow: (row) => [
row.before,
row.idPresent ? getMsg("yes") : getMsg("no"),
],
},
addExternalIdToReference: {
label: ACTION_ADD_EXTERNAL_ID_TO_REFERENCE,
headers: [
"property",
"suggestedProperty",
"extractedId",
"keepUrl",
"referenceUrl",
],
summaryLabel: "add external ID to reference",
isRemoveRefCategory: false,
detect: detectAddExternalIdToReference,
requiresLargeBuffers: true,
requiresHeavyComputing: true,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
renderEntity(row.suggestedProperty, labels),
row.extractedId,
row.keepUrl ? getMsg("yes") : getMsg("no"),
formatUrlForDisplay(row.referenceUrl),
],
},
splitMultipleReferenceUrls: {
label: "splitMultipleReferenceUrls",
headers: ["property", "count"],
isRemoveRefCategory: false,
summaryLabel: "split multiple reference URLs",
detect: detectMultipleReferenceUrls,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
String(row.urlCount),
],
},
cleanUrls: {
label: "cleanUrls",
headers: ["property", "original", "normalized"],
isRemoveRefCategory: false,
summaryLabel: "clean URLs",
detect: detectCleanUrls,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
formatUrlForDisplay(row.before),
formatUrlForDisplay(row.after),
],
},
removeSelfCite: {
label: "removeSelfCite",
headers: ["property", "removed"],
isRemoveRefCategory: false,
summaryLabel: "remove self-citation",
detect: detectSelfCite,
renderRow: renderRemoveRow,
},
absorbDescribedBySource: {
label: "absorbDescribedBySource",
headers: ["property", "removed", "externalIdClaim"],
isRemoveRefCategory: false,
requiresLargeBuffers: true,
summaryLabel: "remove redundant described-by-source",
detect: detectAbsorbDescribedBySource,
renderRow: (row, labels) => {
const propLink = renderLink({
id: row.pid,
claimId: row.claimId,
labels,
});
const removeLink = renderLink({
claimId: row.claimId,
datavalue: row.value,
labels,
});
const extIdLink = row.extIdClaimId
? renderLink({ id: row.extIdPid, claimId: row.extIdClaimId, labels })
: row.keepClaimId
? renderLink({
claimId: row.keepClaimId,
datavalue: row.keepValue,
labels,
})
: renderValue(row.keepValue, labels);
return [propLink, removeLink, extIdLink];
},
},
absorbUrlClaim: {
label: "absorbUrlClaim",
headers: ["urlClaim", "removed", "externalIdClaim"],
isRemoveRefCategory: false,
requiresLargeBuffers: true,
summaryLabel: "remove redundant URL claim",
detect: detectAbsorbUrlClaim,
renderRow: (row, labels) => {
const propLink = renderLink({
id: row.pid,
claimId: row.claimId,
labels,
});
const removeLink = renderLink({
claimId: row.claimId,
datavalue: row.value,
labels,
isUrl: true,
});
const extIdLink = row.extIdClaimId
? renderLink({ id: row.extIdPid, claimId: row.extIdClaimId, labels })
: row.keepClaimId
? renderLink({
claimId: row.keepClaimId,
datavalue: row.keepValue,
labels,
})
: renderValue(row.keepValue, labels);
return [propLink, removeLink, extIdLink];
},
},
convertUrlToExtId: {
label: "convertUrlToExtId",
headers: ["urlClaim", "removed", "newProperty", "extractedId"],
isRemoveRefCategory: false,
requiresLargeBuffers: true,
summaryLabel: "convert URL claim to external ID",
detect: () => [], // driven by detectAbsorbUrlClaim via DEV_ABSORB_URL_CLAIM_CREATE_MISSING
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
renderLink({
claimId: row.claimId,
datavalue: row.value,
labels,
isUrl: true,
}),
renderEntity(row.extIdPid, labels),
row.extractedId,
],
},
duplicateUrlClaims: {
label: "duplicateUrlClaims",
headers: ["property", "removed", "becauseOf"],
isRemoveRefCategory: false,
summaryLabel: "merge duplicate URL claims",
detect: detectDuplicateUrlClaims,
renderRow: (row, labels) => {
const propLink = renderLink({
id: row.pid,
claimId: row.claimId,
labels,
});
const removeLink = renderLink({
claimId: row.claimId,
datavalue: row.value,
labels,
isUrl: true,
});
const keepLink = row.extIdClaimId
? renderLink({ id: row.extIdPid, claimId: row.extIdClaimId, labels })
: row.keepClaimId
? renderLink({
claimId: row.keepClaimId,
datavalue: row.keepValue,
labels,
})
: renderValue(row.keepValue, labels);
return [propLink, removeLink, keepLink];
},
},
removeRedundantRefUrl: {
label: "removeRedundantRefUrl",
headers: ["property", "referenceUrl"],
isRemoveRefCategory: false,
summaryLabel: "remove redundant reference URL (P854)",
detect: detectRemoveRedundantRefUrl,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
formatUrlForDisplay(row.referenceUrl || row._redundantRefUrl || ""),
],
},
blocklistedUrlClaims: {
label: "blocklistedUrlClaims",
headers: ["property", "value", "blocklistAction", "blocklistReason"],
isRemoveRefCategory: false,
summaryLabel: "deprecate/remove blocklisted URL claims",
detect: detectBlocklistedUrlClaims,
renderRow: (row, labels) => [
renderLink({ id: row.pid, claimId: row.claimId, labels }),
renderLink({
claimId: row.claimId,
datavalue: row.value,
labels,
isUrl: true,
}),
row.action === ACTION_REMOVE_CLAIM ? "remove" : "deprecate",
row.sectionLabel || "",
],
},
addMulLabel: {
label: "addMulLabel",
headers: ["mulLabelValue", "mulLabelLanguages"],
isRemoveRefCategory: false,
summaryLabel: "add mul label",
detect: detectAddMulLabel,
renderRow: (row) => [row.value, row.matchingLangs],
},
addMulAlias: {
label: "addMulAlias",
headers: ["mulLabelValue", "mulAliasLangCount"],
isRemoveRefCategory: false,
summaryLabel: "add mul alias",
detect: detectAddMulAlias,
renderRow: (row) => [
row.value,
`${row.langCount} (${row.sourceLangs.slice(0, 8).join(", ")}${row.langCount > 8 ? ", …" : ""})`,
],
},
fixCopiedLabel: {
label: "fixCopiedLabel",
headers: [
"fixCopiedLabelLang",
"fixCopiedLabelBefore",
"fixCopiedLabelAfter",
],
isRemoveRefCategory: false,
summaryLabel: "fix copied labels",
detect: detectFixCopiedLabels,
renderRow: (row) => [
renderLangCode(row.lang),
row.before,
row.sitelinkTitle
? $("<a>")
.attr(
"href",
`https://${row.lang}.wikipedia.org/wiki/${encodeURIComponent(row.sitelinkTitle.replace(/ /g, "_"))}`,
)
.attr("target", "_blank")
.text(row.after)
: row.after,
],
},
};
// ==== 21 UI helpers =======================================================
/**
 * Harvest human-readable labels for properties and items from the rendered
 * Wikibase page, so preview tables can show "label (ID)" without extra API
 * calls.
 *
 * Three DOM sources are scanned in order; an ID found by an earlier pass is
 * never overwritten by the snak-property or value passes:
 *   1. statement-group headings (keyed by their data-property-id attribute),
 *   2. snak property links whose title starts with "Property:",
 *   3. value links whose title is a bare QID or whose href contains /wiki/Q….
 *
 * @returns {Object<string, string>} Map from entity ID (P…/Q…) to trimmed label text.
 */
function collectPropertyLabels() {
  const PROPERTY_TITLE_PREFIX = "Property:";
  const labelsById = {};

  // Pass 1: statement-group headings.
  for (const groupEl of document.querySelectorAll(
    ".wikibase-statementgroupview",
  )) {
    const propertyId = groupEl.getAttribute("data-property-id");
    const headingLink = groupEl.querySelector(
      ".wikibase-statementgroupview-property-label a",
    );
    if (!propertyId || !headingLink) continue;
    labelsById[propertyId] = headingLink.textContent.trim();
  }

  // Pass 2: property links inside snaks (qualifiers and references).
  for (const link of document.querySelectorAll(
    ".wikibase-snakview-property a",
  )) {
    const linkTitle = link.getAttribute("title");
    if (!linkTitle?.startsWith(PROPERTY_TITLE_PREFIX)) continue;
    const propertyId = linkTitle.slice(PROPERTY_TITLE_PREFIX.length);
    if (labelsById[propertyId]) continue;
    labelsById[propertyId] = link.textContent.trim();
  }

  // Pass 3: item links used as statement or snak values.
  for (const link of document.querySelectorAll(
    ".wikibase-snakview-value a, .wikibase-statementview-value a",
  )) {
    const linkTitle = link.getAttribute("title");
    let itemId = null;
    if (/^Q\d+$/.test(linkTitle)) {
      itemId = linkTitle;
    } else {
      const hrefMatch = (link.getAttribute("href") || "").match(
        /\/wiki\/(Q\d+)/,
      );
      if (hrefMatch) itemId = hrefMatch[1];
    }
    if (!itemId || labelsById[itemId]) continue;
    const caption = link.textContent.trim();
    // Empty link text is skipped so a later, non-empty link can still supply the label.
    if (caption) labelsById[itemId] = caption;
  }

  return labelsById;
}
/**
 * Turn a list of entity IDs into a comma-separated display string.
 *
 * @param {string[]|null|undefined} pidList IDs to render; a falsy value is treated as an empty list.
 * @param {Object<string, string>} labels Map from ID to label; IDs without a label are shown bare.
 * @returns {string} e.g. "retrieved (P813), P854".
 */
function resolvePidList(pidList, labels) {
  const describe = (pid) => {
    const known = labels[pid];
    if (!known) return pid;
    return `${known} (${pid})`;
  };
  const ids = pidList || [];
  return ids.map(describe).join(", ");
}
/**
 * Render a language code as a jQuery <span> wrapping a <code> element.
 *
 * The language name is looked up in wikipediaLangNamesCache:
 *   - default:        <span><code>langCode</code> Name</span>
 *   - hintOnly=true:  <span title="Name"><code>langCode</code></span>
 * When the cache has no name for the code, only the <code> element is emitted
 * in either mode.
 *
 * @param {string} langCode Language code to display.
 * @param {{hintOnly?: boolean}} [options] Set hintOnly to show the name as a tooltip instead of inline text.
 * @returns {jQuery} The wrapping <span>.
 */
function renderLangCode(langCode, { hintOnly = false } = {}) {
  const langName = wikipediaLangNamesCache.get(langCode);
  const wrapper = $("<span>");
  wrapper.append($("<code>").text(langCode));
  if (!langName) return wrapper;
  if (hintOnly) {
    wrapper.attr("title", langName);
  } else {
    // A text node (not HTML) keeps the name inert even if it contains markup.
    wrapper.append(document.createTextNode(" " + langName));
  }
  return wrapper;
}
/**
 * Format an entity ID for display.
 *
 * @param {string} id Entity ID (P… or Q…).
 * @param {Object<string, string>} [labels] Map from ID to label.
 * @returns {string} "label (id)" when a non-empty label is known, otherwise the id unchanged.
 */
function renderEntity(id, labels = {}) {
  const known = labels[id];
  if (!known) return id;
  return `${known} (${id})`;
}
/**
 * Produce a short plain-text rendering of a Wikibase datavalue.
 *
 * Handling, in order:
 *   - falsy datavalue                 -> ""
 *   - value with a truthy `time` field -> "<time> [<calendar>, <precision>]",
 *     with the leading "+" and a midnight "T00:00:00Z" suffix stripped
 *   - type "wikibase-entityid"        -> renderEntity(value.id, labels)
 *   - type "string"                   -> the value itself
 *   - type "globecoordinate"          -> "(lat, lon)"
 *   - any other string value          -> the value itself
 *   - everything else                 -> JSON of the whole datavalue
 *
 * @param {{type: string, value: *}|null|undefined} dataValue Datavalue to render.
 * @param {Object<string, string>} [labels] Map from entity ID to label.
 * @returns {string} Display text.
 */
function renderValue(dataValue, labels = {}) {
  if (!dataValue) return "";
  const kind = dataValue.type;
  const payload = dataValue.value;

  // Time values are detected by shape rather than by type.
  if (payload?.time) {
    const stamp = payload.time.replace("+", "").replace("T00:00:00Z", "");
    const calendarNames = {
      [URL_PROLEPTIC_GREGORIAN_CALENDAR]: "Gregorian",
      [URL_PROLEPTIC_JULIAN_CALENDAR]: "Julian",
    };
    const calendarName = calendarNames[payload.calendarmodel] || "Unknown";
    // Fall back to the raw numeric precision when no label is defined for it.
    const precisionName =
      precisionLabels[payload.precision] || payload.precision;
    return `${stamp} [${calendarName}, ${precisionName}]`;
  }

  switch (kind) {
    case "wikibase-entityid":
      return renderEntity(payload?.id, labels);
    case "string":
      return payload;
    case "globecoordinate":
      return `(${payload.latitude}, ${payload.longitude})`;
    default:
      return typeof payload === "string"
        ? payload
        : JSON.stringify(dataValue);
  }
}
/**
 * Build an in-page anchor that jumps to a claim (href "#<claimId>").
 *
 * The link text is the entity rendering of `id` when `id` is given, otherwise
 * the rendering of `datavalue`; with `isUrl` the text is additionally passed
 * through formatUrlForDisplay.
 *
 * @param {Object} options
 * @param {string} [options.id] Entity ID to use as link text.
 * @param {string} options.claimId Claim ID used as the fragment target.
 * @param {Object<string, string>} [options.labels] Map from entity ID to label.
 * @param {Object} [options.datavalue] Datavalue used as link text when no id is given.
 * @param {boolean} [options.isUrl] Format the link text as a URL for display.
 * @returns {jQuery} The <a> element.
 */
function renderLink({ id, claimId, labels = {}, datavalue, isUrl = false }) {
  let caption;
  if (id) {
    caption = renderEntity(id, labels);
  } else {
    caption = renderValue(datavalue, labels);
  }
  if (isUrl) {
    caption = formatUrlForDisplay(caption);
  }
  const anchor = $("<a>");
  anchor.attr("href", `#${claimId}`);
  anchor.text(caption);
  return anchor;
}
/**
 * Build a link to a property page (/wiki/Property:<pid>).
 *
 * @param {string} pid Property ID.
 * @param {string} [text] Link text; the pid is used when this is empty or omitted.
 * @returns {jQuery} The <a> element.
 */
function renderPidLink(pid, text) {
  const caption = text || pid;
  const anchor = $("<a>");
  anchor.attr("href", `/wiki/Property:${pid}`);
  anchor.text(caption);
  return anchor;
}
/**
 * Build a link to an item page (/wiki/<qid>).
 *
 * @param {string} qid Item ID.
 * @param {string} [text] Link text; the qid is used when this is empty or omitted.
 * @returns {jQuery} The <a> element.
 */
function renderQidLink(qid, text) {
  const caption = text || qid;
  const anchor = $("<a>");
  anchor.attr("href", `/wiki/${qid}`);
  anchor.text(caption);
  return anchor;
}
/**
 * Wrap one cell value in a new <td>.
 *
 * jQuery objects and DOM elements are appended as nodes; anything else is
 * stringified and set as text (null/undefined become an empty string).
 *
 * @param {*} cell Cell content.
 * @returns {jQuery} The <td> element.
 */
function renderCell(cell) {
  const td = $("<td>");
  const isNode = cell instanceof jQuery || cell instanceof HTMLElement;
  if (!isNode) {
    td.text(String(cell ?? ""));
    return td;
  }
  td.append(cell);
  return td;
}
// ==== 22 UI rendering =====================================================
/**
 * Build the cleanup preview box and append it to the entity terms view.
 *
 * Preview rows are grouped by `row.type`; each group that has a matching entry
 * in `detectors` gets a master checkbox, a label and a table with one
 * checkbox per row. Groups whose headers include "property" are sorted by
 * numeric property ID. Everything starts selected.
 *
 * @param {Object} entity Entity being cleaned; passed through to each detector's renderRow.
 * @param {Object[]} previewRows Rows to preview. Each row has a `type` (detector key) and
 *   may carry a `rowId`; rows without one get the fallback id `<type>_<index>`.
 * @param {Function} applyCleanupChanges Called as
 *   applyCleanupChanges(checkboxStates, rowStates) when the run button is clicked, where
 *   checkboxStates maps category -> boolean and rowStates maps row id -> boolean.
 */
function renderCleanupUI(entity, previewRows, applyCleanupChanges) {
  const box = $(`
<div id="WikidataCleanup_box" style="margin-top:10px; border:1px solid #ccc; padding:10px;">
<h3>${getMsg("cleanupTitle")}</h3>
<p>${getMsg("cleanupIntro")}</p>
<div id="WikidataCleanup_preview"></div>
</div>
`);
  // Selection state shared with applyCleanupChanges.
  const checkboxStates = {}; // category -> boolean
  const rowStates = {}; // row id -> boolean
  const rowWidgets = {}; // row id -> row checkbox widget
  const preview = box.find("#WikidataCleanup_preview");

  const runBtn = new OO.ui.ButtonWidget({
    label: getMsg("runButton"),
    flags: ["progressive", "primary"],
  });
  runBtn.on("click", () => applyCleanupChanges(checkboxStates, rowStates));
  const settingsBtn = new OO.ui.ButtonWidget({
    label: getMsg("settingsButton"),
  });
  settingsBtn.on("click", () =>
    showSettingsDialog(cache_loadSettings(), cache_saveSettings),
  );
  const controls = $("<div>").css({
    marginTop: "10px",
    display: "flex",
    justifyContent: "space-between",
  });
  controls.append(runBtn.$element, settingsBtn.$element);
  box.append(controls);

  if (!previewRows.length) {
    preview.html(
      `<div class="mw-message-box mw-message-box-notice"><strong>${getMsg("noCleanups")}</strong></div>`,
    );
  } else {
    const labels = collectPropertyLabels();
    const grouped = {};
    for (const row of previewRows) {
      (grouped[row.type] ||= []).push(row);
    }
    for (const cat of Object.keys(grouped)) {
      const def = detectors[cat];
      if (!def) continue;
      let rows = grouped[cat];
      if (def.headers.includes("property")) {
        // Sort a copy so the grouped input is left untouched.
        rows = rows
          .slice()
          .sort(
            (a, b) =>
              parseInt(a.pid.replace("P", ""), 10) -
              parseInt(b.pid.replace("P", ""), 10),
          );
      }

      // Category header: master checkbox + label.
      const master = new OO.ui.CheckboxInputWidget({ selected: true });
      checkboxStates[cat] = true;
      const masterInput = master.$element.find("input")[0];
      const labelWidget = new OO.ui.LabelWidget({ label: getMsg(def.label) });
      const row = $("<div>").css({ margin: "6px 0" });
      row.append(master.$element, labelWidget.$element);
      preview.append(row);

      // Table skeleton: a narrow checkbox column followed by the detector's headers.
      const thead = $("<thead><tr></tr></thead>");
      ["", ...def.headers.map((h) => getMsg(h))].forEach((h) =>
        thead.find("tr").append($("<th>").text(h)),
      );
      const colgroup = $("<colgroup>").append('<col style="width:2.2em">');
      for (let i = 0; i < def.headers.length; i++) colgroup.append("<col>");
      const table = $(
        '<table class="wikitable" style="width:100%; table-layout:fixed; margin-bottom:10px;">',
      ).append(colgroup, thead, "<tbody></tbody>");
      const tbody = table.find("tbody");

      // Ids under which this category's rows are registered in rowStates and
      // rowWidgets. Rows lacking `rowId` use a generated fallback, so every
      // later lookup must go through this list, never through rowObj.rowId.
      const rowIds = [];

      // Reflect the per-row selection in the master checkbox (none / all / partial).
      const updateMasterCheckbox = () => {
        const checkedCount = rowIds.filter((id) => rowStates[id]).length;
        if (checkedCount === 0) {
          masterInput.checked = false;
          masterInput.indeterminate = false;
          checkboxStates[cat] = false;
        } else if (checkedCount === rowIds.length) {
          masterInput.checked = true;
          masterInput.indeterminate = false;
          checkboxStates[cat] = true;
        } else {
          // Partial selection: the category stays enabled; rowStates decides per row.
          masterInput.checked = false;
          masterInput.indeterminate = true;
          checkboxStates[cat] = true;
        }
      };

      rows.forEach((rowObj, idx) => {
        const rowId = rowObj.rowId || `${cat}_${idx}`;
        rowIds.push(rowId);
        rowStates[rowId] = true;
        const rowCb = new OO.ui.CheckboxInputWidget({ selected: true });
        rowWidgets[rowId] = rowCb;
        rowCb.on("change", (val) => {
          rowStates[rowId] = val;
          updateMasterCheckbox();
        });
        const tr = $("<tr>").append($("<td>").append(rowCb.$element));
        def
          .renderRow(rowObj, labels, entity)
          .forEach((cell) => tr.append(renderCell(cell)));
        tbody.append(tr);
      });
      preview.append(table);

      // Master checkbox toggles every row of the category at once.
      $(masterInput).on("click", () => {
        const checked = masterInput.checked;
        checkboxStates[cat] = checked;
        for (const rowId of rowIds) {
          rowStates[rowId] = checked;
          rowWidgets[rowId].setSelected(checked, true);
        }
        masterInput.indeterminate = false;
      });
    }
  }
  $(".wikibase-entitytermsview-entitytermsforlanguagelistview").append(box);
}
// ==== 23 Diff application =================================================
/**
 * Re-file the snaks of a reference that match `snakHash` from `oldProp` to
 * `newProp`, mutating `ref` in place.
 *
 * Moved snaks get `property = newProp` and lose their `hash`, since it no
 * longer describes the changed snak. They are placed before the snaks already
 * under `newProp` when `oldProp` preceded `newProp` in "snaks-order", and
 * after them otherwise.
 *
 * "snaks-order" is kept consistent with the result:
 *   - if every `oldProp` snak moved, `newProp` takes over `oldProp`'s slot;
 *   - if some `oldProp` snaks remain, `oldProp` keeps its slot and `newProp`
 *     (when not already listed) is inserted directly before it;
 *   - `newProp` is always listed afterwards;
 *   - a missing "snaks-order" is rebuilt from the keys of `ref.snaks`.
 *
 * Nothing happens when `oldProp` has no snaks, no snak matches `snakHash`, or
 * `oldProp` equals `newProp`.
 *
 * @param {Object} ref Reference object with `snaks` and optionally "snaks-order".
 * @param {string} oldProp Property ID the snak is currently filed under.
 * @param {string} newProp Property ID to file the snak under.
 * @param {string} snakHash Hash identifying the snak(s) to move.
 * @returns {void}
 */
function replacePropertyInReference(ref, oldProp, newProp, snakHash) {
  const oldSnaks = ref.snaks?.[oldProp];
  // Moving a property onto itself would otherwise duplicate and then drop snaks.
  if (!oldSnaks || oldProp === newProp) return;

  // Partition before mutating: moved snaks lose the hash used for matching.
  const toMove = oldSnaks.filter((snak) => snak.hash === snakHash);
  if (!toMove.length) return;
  const keepOld = oldSnaks.filter((snak) => snak.hash !== snakHash);

  // Work on a copy of the order; fall back to the snak keys when it is absent.
  const order = Array.isArray(ref["snaks-order"])
    ? [...ref["snaks-order"]]
    : Object.keys(ref.snaks);
  const oldIdx = order.indexOf(oldProp);
  const newIdx = order.indexOf(newProp);

  for (const snak of toMove) {
    snak.property = newProp;
    delete snak.hash;
  }

  const existing = ref.snaks[newProp] || [];
  const movedGoFirst = oldIdx !== -1 && newIdx !== -1 && oldIdx < newIdx;
  ref.snaks[newProp] = movedGoFirst
    ? [...toMove, ...existing]
    : [...existing, ...toMove];

  if (keepOld.length) {
    // oldProp still has snaks, so it must stay listed in the order.
    ref.snaks[oldProp] = keepOld;
    if (newIdx === -1) {
      if (oldIdx !== -1) order.splice(oldIdx, 0, newProp);
      else order.push(newProp);
    }
  } else {
    delete ref.snaks[oldProp];
    if (oldIdx !== -1) order[oldIdx] = newProp;
    else if (newIdx === -1) order.push(newProp);
  }
  // newProp may now appear twice; keep its first position only.
  ref["snaks-order"] = [...new Set(order)];
}
/**
* Applies a list of cleanup diffs to deep-cloned copies of the entity's claims
* and returns a wbeditentity-ready update object containing the modified claims,
* labels, descriptions, and aliases.
*/
function mergeCleanupDiffs(entity, diffs) {
const updates = { claims: [] };
const claimMap = new Map();
function getOrCloneClaim(pid, claimId) {
let merged = claimMap.get(claimId);
if (!merged) {
const orig = (entity.claims[pid] || []).find((c) => c.id === claimId);
if (!orig) return null;
merged = JSON.parse(JSON.stringify(orig));
claimMap.set(claimId, merged);
}
return merged._remove ? null : merged;
}
for (const diff of diffs) {
switch (diff.action) {
case ACTION_REMOVE_REFS: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
const idx = claim.references.findLastIndex(
(r) => r.hash === diff.refHash,
);
if (idx !== -1) claim.references.splice(idx, 1);
break;
}
case ACTION_REMOVE_CLAIM: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (claim) claim._remove = true;
break;
}
case ACTION_SPLIT_REFERENCE_URLS: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
const idx = claim.references.findLastIndex(
(r) => r.hash === diff.refHash,
);
if (idx === -1) break;
const before = claim.references.slice(0, idx);
const after = claim.references.slice(idx + 1);
const refToSplit = claim.references[idx];
// Collect splittable snaks (excluding P813 and P2960)
const allEntries = [];
for (const pid in refToSplit.snaks || {}) {
if (pid === PID_RETRIEVED || pid === PID_ARCHIVE_DATE) continue;
for (const snak of refToSplit.snaks[pid] || [])
allEntries.push({ pid, snak });
}
if (allEntries.length <= 1) break;
const retrievedSnaks = refToSplit.snaks[PID_RETRIEVED] || [];
const latestRetrieved = retrievedSnaks.length
? retrievedSnaks.reduce((max, cur) =>
(cur.datavalue?.value?.time || "") >
(max.datavalue?.value?.time || "")
? cur
: max,
)
: null;
const ar_urls = refToSplit.snaks[PID_ARCHIVE_URL] || [];
const ar_dates = refToSplit.snaks[PID_ARCHIVE_DATE] || [];
if (ar_urls.length > 1 || ar_dates.length > 1) break;
// Identify and extract archive entries
let archiveEntry = null;
const archiveIdx = allEntries.findIndex((e) => {
if (e.pid === PID_ARCHIVE_URL) return true;
const raw = e.snak.datavalue?.value;
const v = typeof raw === "string" ? raw : raw?.id || "";
try {
return isArchiveUrl(v);
} catch {
return false;
}
});
if (archiveIdx !== -1) {
// Check there is only one archive entry
const archiveCount = allEntries.filter((e, i) => {
if (e.pid === PID_ARCHIVE_URL) return true;
const raw = e.snak.datavalue?.value;
const v = typeof raw === "string" ? raw : raw?.id || "";
try {
return isArchiveUrl(v);
} catch {
return false;
}
}).length;
if (archiveCount > 1) break;
archiveEntry = allEntries.splice(archiveIdx, 1)[0];
}
const newRefs = allEntries
.map((entry) => {
const rawVal = entry.snak.datavalue?.value || "";
const val =
typeof rawVal === "string" ? rawVal : rawVal?.id || "";
let mappedPid = entry.pid;
if (
[
PID_REFERENCE_URL,
PID_WIKIMEDIA_IMPORT_URL,
PID_ARCHIVE_URL,
].includes(entry.pid)
) {
if (isArchiveUrl(val)) mappedPid = PID_ARCHIVE_URL;
else if (isWikimediaImportUrl(val))
mappedPid = PID_WIKIMEDIA_IMPORT_URL;
else mappedPid = PID_REFERENCE_URL;
}
if (mappedPid === PID_ARCHIVE_URL && archiveEntry) return null; // handled below
const snaks = { [mappedPid]: [entry.snak] };
const snaksOrder = [mappedPid];
if (latestRetrieved && mappedPid !== PID_ARCHIVE_URL) {
snaks[PID_RETRIEVED] = [latestRetrieved];
snaksOrder.push(PID_RETRIEVED);
}
return { snaks, "snaks-order": snaksOrder };
})
.filter(Boolean);
// Build archive ref last
let arRef = null;
if (archiveEntry) {
const arSnaks = { [PID_ARCHIVE_URL]: [archiveEntry.snak] };
const arSnaksOrder = [PID_ARCHIVE_URL];
if (ar_dates.length) {
arSnaks[PID_ARCHIVE_DATE] = [ar_dates[0]];
arSnaksOrder.push(PID_ARCHIVE_DATE);
}
arRef = { snaks: arSnaks, "snaks-order": arSnaksOrder };
} else if (ar_urls.length === 1) {
const arSnaks = { [PID_ARCHIVE_URL]: [ar_urls[0]] };
const arSnaksOrder = [PID_ARCHIVE_URL];
if (ar_dates.length) {
arSnaks[PID_ARCHIVE_DATE] = [ar_dates[0]];
arSnaksOrder.push(PID_ARCHIVE_DATE);
}
arRef = { snaks: arSnaks, "snaks-order": arSnaksOrder };
}
if (arRef) newRefs.push(arRef);
claim.references = [...before, ...newRefs, ...after];
break;
}
case ACTION_MERGE_CLAIM: {
const from = getOrCloneClaim(diff.pid, diff.fromClaimId);
const to = getOrCloneClaim(diff.pid, diff.toClaimId);
if (!from || !to) break;
// Merge references
const toRefHashes = new Set((to.references || []).map((r) => r.hash));
to.references = (to.references || []).concat(
(from.references || []).filter((r) => !toRefHashes.has(r.hash)),
);
// Merge qualifiers
for (const pid in from.qualifiers || {}) {
const toHashes = new Set(
(to.qualifiers?.[pid] || []).map((s) => s.hash),
);
const newSnaks = (from.qualifiers[pid] || []).filter(
(s) => !toHashes.has(s.hash),
);
if (newSnaks.length) {
to.qualifiers = to.qualifiers || {};
to.qualifiers[pid] = (to.qualifiers[pid] || []).concat(newSnaks);
}
}
from._remove = true;
break;
}
case ACTION_DOWNGRADE_PREFERRED: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
claim.rank = "normal";
// When called from detectUpgradePreciseDate the source claim is
// deprecated (fromDeprecated:true) and carries P2241 instead of P7452.
const qualToRemove = diff.fromDeprecated
? PID_REASON_FOR_DEPRECATED_RANK
: PID_REASON_FOR_PREFERRED_RANK;
if (diff.removedQualifier && claim.qualifiers?.[qualToRemove]) {
delete claim.qualifiers[qualToRemove];
if (claim["qualifiers-order"]) {
claim["qualifiers-order"] = claim["qualifiers-order"].filter(
(p) => p !== qualToRemove,
);
}
}
break;
}
case ACTION_UPGRADE_PRECISE_DATE: {
// Set the normal-rank precise date claim to preferred rank and add
// P7452 = Q71536040 (most precise value) as a qualifier.
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
claim.rank = "preferred";
claim.qualifiers = claim.qualifiers || {};
claim["qualifiers-order"] = claim["qualifiers-order"] || [];
// Add P7452 = Q71536040 only when not already present
if (!claim.qualifiers[PID_REASON_FOR_PREFERRED_RANK]) {
claim.qualifiers[PID_REASON_FOR_PREFERRED_RANK] = [
{
snaktype: "value",
property: PID_REASON_FOR_PREFERRED_RANK,
datavalue: {
value: {
"entity-type": "item",
"numeric-id": parseInt(
QID_MOST_PRECISE.replace("Q", ""),
10,
),
id: QID_MOST_PRECISE,
},
type: "wikibase-entityid",
},
},
];
if (
!claim["qualifiers-order"].includes(PID_REASON_FOR_PREFERRED_RANK)
) {
claim["qualifiers-order"].unshift(PID_REASON_FOR_PREFERRED_RANK);
}
}
break;
}
case ACTION_CHANGE_PROPERTY: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim || diff.context !== "reference") break;
const refIdx = claim.references.findIndex(
(r) => r.hash === diff.refHash,
);
if (refIdx !== -1) {
replacePropertyInReference(
claim.references[refIdx],
diff.oldProperty,
diff.newProperty,
diff.snakHash,
);
}
break;
}
case ACTION_CHANGE_VALUE: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim || diff.context !== "reference") break;
const ref = claim.references.find((r) => r.hash === diff.refHash);
if (!ref) break;
for (const snakPid in ref.snaks || {}) {
for (const snak of ref.snaks[snakPid]) {
if (snak.hash === diff.snakHash) {
snak.datavalue = snak.datavalue || {};
snak.datavalue.value = snak.datavalue.value || {};
snak.datavalue.value.id = diff.newValue;
// Keep numeric-id in sync for wikibase-entityid snaks (QIDs).
if (isQid(diff.newValue)) {
snak.datavalue.value["numeric-id"] = parseInt(
diff.newValue.replace("Q", ""),
10,
);
}
delete snak.hash;
}
}
}
break;
}
case ACTION_REMOVE_QUALIFIER: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
const qPid = diff.qualifierPid;
const filtered = (claim.qualifiers?.[qPid] || []).filter(
(s) => s.hash !== diff.qualifierSnakHash,
);
if (filtered.length === 0) {
delete claim.qualifiers[qPid];
if (claim["qualifiers-order"]) {
claim["qualifiers-order"] = claim["qualifiers-order"].filter(
(p) => p !== qPid,
);
}
} else {
claim.qualifiers[qPid] = filtered;
}
break;
}
case ACTION_CLEAN_URL: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
if (diff.context === "claim") {
// Top-level mainsnak
if (claim.mainsnak?.datavalue?.value === diff.before) {
claim.mainsnak.datavalue.value = diff.after;
delete claim.mainsnak.hash;
}
} else if (diff.context === "qualifier") {
// Qualifier snak identified by PID + hash
for (const snak of claim.qualifiers?.[diff.snakPid] || []) {
if (
snak.hash === diff.snakHash &&
snak.datavalue?.value === diff.before
) {
snak.datavalue.value = diff.after;
delete snak.hash;
break;
}
}
} else {
// Reference snak (context === "reference" or legacy diffs without context)
const ref = claim.references?.find((r) => r.hash === diff.refHash);
if (!ref) break;
// snakPid may be absent in old diffs; fall back to PID_REFERENCE_URL
const targetPid = diff.snakPid || PID_REFERENCE_URL;
for (const snak of ref.snaks?.[targetPid] || []) {
if (
snak.datatype === "url" &&
snak.datavalue?.value === diff.before
) {
snak.datavalue.value = diff.after;
delete snak.hash;
break;
}
}
}
break;
}
case ACTION_MOVE_QUALIFIER_TO_REFERENCE: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
const retrievedQuals = claim.qualifiers?.[PID_RETRIEVED] || [];
if (!retrievedQuals.length) break;
const p813Snaks = retrievedQuals.map((q) => ({
snaktype: q.snaktype || "value",
property: PID_RETRIEVED,
datavalue: q.datavalue
? JSON.parse(JSON.stringify(q.datavalue))
: undefined,
}));
const refs = claim.references || [];
const hasP813 = (r) => (r?.snaks?.[PID_RETRIEVED] || []).length > 0;
if (refs.length === 1 && !hasP813(refs[0])) {
const target = refs[0];
target.snaks = target.snaks || {};
target["snaks-order"] = target["snaks-order"] || [];
target.snaks[PID_RETRIEVED] = (
target.snaks[PID_RETRIEVED] || []
).concat(p813Snaks);
if (!target["snaks-order"].includes(PID_RETRIEVED))
target["snaks-order"].push(PID_RETRIEVED);
} else {
claim.references = refs.concat([
{
snaks: { [PID_RETRIEVED]: p813Snaks },
"snaks-order": [PID_RETRIEVED],
},
]);
}
delete claim.qualifiers[PID_RETRIEVED];
break;
}
case ACTION_REMOVE_ALIAS: {
updates.aliases = updates.aliases || {};
const current =
updates.aliases[diff.lang] || entity.aliases[diff.lang] || [];
updates.aliases[diff.lang] = current.filter(
(a) => a.value !== diff.value,
);
break;
}
case ACTION_NORMALIZE: {
if (diff.field === "label") {
updates.labels = updates.labels || {};
updates.labels[diff.lang] = {
language: diff.lang,
value: diff.after,
};
} else if (diff.field === "description") {
updates.descriptions = updates.descriptions || {};
updates.descriptions[diff.lang] = {
language: diff.lang,
value: diff.after,
};
} else if (diff.field === "alias") {
updates.aliases = updates.aliases || {};
const current =
updates.aliases[diff.lang] || entity.aliases[diff.lang] || [];
updates.aliases[diff.lang] = current.map((a) =>
a.value === diff.before
? { language: diff.lang, value: diff.after }
: a,
);
}
break;
}
case ACTION_ADD_EXTERNAL_ID_TO_REFERENCE: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
const ref = claim.references.find((r) => r.hash === diff.refHash);
if (!ref) break;
ref.snaks = ref.snaks || {};
ref["snaks-order"] = ref["snaks-order"] || [];
const insertBeforeP854 = (pid) => {
if (!ref["snaks-order"].includes(pid)) {
const p854i = ref["snaks-order"].indexOf(PID_REFERENCE_URL);
if (p854i !== -1) ref["snaks-order"].splice(p854i, 0, pid);
else ref["snaks-order"].push(pid);
}
};
// Add P248 (stated in) if missing
if (
!ref.snaks[PID_STATED_IN] &&
propertyStatedInCache.has(diff.suggestedProperty)
) {
const statedInData = propertyStatedInCache.get(
diff.suggestedProperty,
);
if (statedInData?.preferred) {
ref.snaks[PID_STATED_IN] = [
{
snaktype: "value",
property: PID_STATED_IN,
datavalue: {
value: {
"entity-type": "item",
"numeric-id": parseInt(
statedInData.preferred.replace("Q", ""),
10,
),
id: statedInData.preferred,
},
type: "wikibase-entityid",
},
},
];
insertBeforeP854(PID_STATED_IN);
}
}
// Add the extracted external ID
ref.snaks[diff.suggestedProperty] = (
ref.snaks[diff.suggestedProperty] || []
).concat([
{
snaktype: "value",
property: diff.suggestedProperty,
datavalue: { value: diff.extractedId, type: "string" },
},
]);
insertBeforeP854(diff.suggestedProperty);
// Remove P854 if not needed
if (!diff.keepUrl) {
delete ref.snaks[PID_REFERENCE_URL];
ref["snaks-order"] = ref["snaks-order"].filter(
(p) => p !== PID_REFERENCE_URL,
);
}
break;
}
case ACTION_ABSORB_CLAIM: {
// 1. Mark the source claim for removal (may be a P1343 claim or a URL-type claim).
const p1343 = getOrCloneClaim(diff.pid, diff.claimId);
if (p1343) p1343._remove = true;
// 2. Move the source claim's references to the ext-id claim, injecting any
// qualifiers from the source claim as extra values inside each moved reference.
// If there are no references but qualifiers are present, synthesise a
// new bare reference on the ext-id claim from those qualifiers alone.
const extIdClaim = getOrCloneClaim(diff.extIdPid, diff.extIdClaimId);
if (!extIdClaim) break;
extIdClaim.references = extIdClaim.references || [];
// Build qualifier snaks (hash-stripped, ready to embed into a reference).
// P2699 (URL) is remapped to P854 (reference URL) because URL is a statement
// qualifier property whereas P854 is its reference counterpart.
const qualSnaksByPid = {};
const qualOrder = [];
for (const qPid of diff.qualifiersOrder || []) {
const refPid = qPid === PID_URL ? PID_REFERENCE_URL : qPid;
const snaks = (diff.qualifiers?.[qPid] || []).map((s) => ({
snaktype: s.snaktype,
property: refPid,
datavalue: s.datavalue
? JSON.parse(JSON.stringify(s.datavalue))
: undefined,
}));
if (snaks.length) {
qualSnaksByPid[refPid] = snaks;
if (!qualOrder.includes(refPid)) qualOrder.push(refPid);
}
}
if (diff.refHashes?.length) {
// Migrate existing references from the source claim, enriching each with qualifier values.
// Read from the original entity — the clone only carries _remove
const p1343Original = (entity.claims[diff.pid] || []).find(
(c) => c.id === diff.claimId,
);
const existingHashes = new Set(
extIdClaim.references.map((r) => r.hash),
);
const refsToMove = (p1343Original?.references || []).filter(
(r) =>
diff.refHashes.includes(r.hash) && !existingHashes.has(r.hash),
);
for (const origRef of refsToMove) {
const newRef = JSON.parse(JSON.stringify(origRef));
delete newRef.hash;
newRef.snaks = newRef.snaks || {};
newRef["snaks-order"] = newRef["snaks-order"] || [];
for (const qPid of qualOrder) {
if (!newRef.snaks[qPid]) {
newRef.snaks[qPid] = qualSnaksByPid[qPid];
newRef["snaks-order"].push(qPid);
}
// Leave existing pid snaks untouched to avoid value conflicts
}
extIdClaim.references.push(newRef);
}
} else if (qualOrder.length) {
// No references on P1343, but it has qualifiers — synthesise a new
// reference on the ext-id claim consisting solely of those qualifier snaks
const newRef = {
snaks: Object.fromEntries(
qualOrder.map((p) => [p, qualSnaksByPid[p]]),
),
"snaks-order": qualOrder.slice(),
};
extIdClaim.references.push(newRef);
}
break;
}
case ACTION_REMOVE_REDUNDANT_REF_URL: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
const ref = claim.references?.find((r) => r.hash === diff.refHash);
if (!ref) break;
// Remove only the specific redundant P854 snak identified by snakHash.
const kept = (ref.snaks[PID_REFERENCE_URL] || []).filter(
(s) => s.hash !== diff.snakHash,
);
if (kept.length) {
ref.snaks[PID_REFERENCE_URL] = kept;
} else {
delete ref.snaks[PID_REFERENCE_URL];
ref["snaks-order"] = (ref["snaks-order"] || []).filter(
(p) => p !== PID_REFERENCE_URL,
);
}
break;
}
case ACTION_REMOVE_OBSOLETE_SNAKS: {
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
const ref = claim.references?.find((r) => r.hash === diff.refHash);
if (!ref) break;
for (const obsoletePid of diff.obsoletePids || []) {
delete ref.snaks[obsoletePid];
ref["snaks-order"] = (ref["snaks-order"] || []).filter(
(p) => p !== obsoletePid,
);
}
break;
}
case ACTION_DEPRECATE_URL_CLAIM: {
// Set the claim rank to "deprecated".
// Qualifiers and references are intentionally left in place so that the
// human reviewer can see why the claim existed.
// When a P2241 (reason for deprecated rank) QID is supplied by the
// blocklist rule, add it as a qualifier.
const claim = getOrCloneClaim(diff.pid, diff.claimId);
if (!claim) break;
claim.rank = "deprecated";
if (diff.deprecationReason && isQid(diff.deprecationReason)) {
claim.qualifiers = claim.qualifiers || {};
claim["qualifiers-order"] = claim["qualifiers-order"] || [];
if (!claim.qualifiers[PID_REASON_FOR_DEPRECATED_RANK]) {
claim.qualifiers[PID_REASON_FOR_DEPRECATED_RANK] = [
{
snaktype: "value",
property: PID_REASON_FOR_DEPRECATED_RANK,
datavalue: {
value: {
"entity-type": "item",
"numeric-id": parseInt(
diff.deprecationReason.replace("Q", ""),
10,
),
id: diff.deprecationReason,
},
type: "wikibase-entityid",
},
},
];
if (
!claim["qualifiers-order"].includes(
PID_REASON_FOR_DEPRECATED_RANK,
)
) {
claim["qualifiers-order"].push(PID_REASON_FOR_DEPRECATED_RANK);
}
}
}
break;
}
case ACTION_SET_MUL_LABEL: {
// Add labels.mul with the shared label value.
updates.labels = updates.labels || {};
updates.labels.mul = { language: "mul", value: diff.value };
break;
}
case ACTION_ADD_MUL_ALIAS: {
// Add diff.value to aliases.mul, preserving any existing mul aliases.
// Start from the already-accumulated updates.aliases.mul when multiple
// ADD_MUL_ALIAS diffs fire in the same edit (shouldn't happen, but safe).
updates.aliases = updates.aliases || {};
const existingMul = updates.aliases.mul
? updates.aliases.mul.slice()
: (entity.aliases?.mul || []).map((a) => ({ ...a }));
const normNew = normalizeText(diff.value);
if (!existingMul.some((a) => normalizeText(a.value) === normNew)) {
existingMul.push({ language: "mul", value: diff.value });
}
updates.aliases.mul = existingMul;
break;
}
case ACTION_CONVERT_URL_TO_EXT_ID: {
// Dev-mode only (DEV_ABSORB_URL_CLAIM_CREATE_MISSING).
// Creates a new ext-id statement from the matched URL, transferring the
// URL claim's qualifiers and references onto it, then removes the URL claim.
// 1. Mark the URL claim for removal.
const urlClaim = getOrCloneClaim(diff.pid, diff.claimId);
if (urlClaim) urlClaim._remove = true;
// 2. Build the new ext-id claim object.
const newExtIdClaim = {
type: "statement",
rank: "normal",
mainsnak: {
snaktype: "value",
property: diff.extIdPid,
datavalue: { value: diff.extractedId, type: "string" },
datatype: "external-id",
},
qualifiers: {},
"qualifiers-order": [],
references: [],
};
// 3. Copy qualifiers from the URL claim (P2699/URL remapped to P854).
for (const qPid of diff.qualifiersOrder || []) {
const refPid = qPid === PID_URL ? PID_REFERENCE_URL : qPid;
const snaks = (diff.qualifiers?.[qPid] || []).map((s) => ({
snaktype: s.snaktype,
property: refPid,
datavalue: s.datavalue
? JSON.parse(JSON.stringify(s.datavalue))
: undefined,
}));
if (snaks.length) {
newExtIdClaim.qualifiers[refPid] = snaks;
newExtIdClaim["qualifiers-order"].push(refPid);
}
}
// 4. Copy references from the original URL claim.
const urlOriginal = (entity.claims[diff.pid] || []).find(
(c) => c.id === diff.claimId,
);
for (const origRef of (urlOriginal?.references || []).filter((r) =>
diff.refHashes.includes(r.hash),
)) {
const newRef = JSON.parse(JSON.stringify(origRef));
delete newRef.hash;
newExtIdClaim.references.push(newRef);
}
// 5. Register the new claim so mergeCleanupDiffs includes it in the API call.
// Give it a temporary id so getOrCloneClaim can track it if needed.
newExtIdClaim.id = `NEW_EXT_ID_${diff.extIdPid}_${diff.claimId}`;
claimMap.set(newExtIdClaim.id, newExtIdClaim);
break;
}
}
}
// Deduplicate aliases
for (const lang in updates.aliases || {}) {
updates.aliases[lang] = updates.aliases[lang].filter(
(a, i, arr) => arr.findIndex((b) => b.value === a.value) === i,
);
}
for (const claim of claimMap.values()) {
if (claim._remove) {
updates.claims.push({ id: claim.id, remove: "" });
} else if (claim.id?.startsWith("NEW_EXT_ID_")) {
// New claim created by ACTION_CONVERT_URL_TO_EXT_ID — strip the temporary id
// so the API treats it as a create rather than an edit.
const { id: _tempId, ...newClaim } = claim;
updates.claims.push(newClaim);
} else {
updates.claims.push(claim);
}
}
return updates;
}
// ==== 24 Core flow ========================================================
/**
 * Executes every active detector against `entity` and gathers the diffs they
 * report.
 *
 * @param {object} entity - Entity object handed unchanged to each detector.
 * @param {object} [settings] - User settings; reads `enabledDetectors`,
 *   `enableLargeBuffers` and `enableHeavyComputing`.
 * @returns {Promise<{previewRows: object[], updates: object, modified: boolean}>}
 *   `previewRows` holds the visible (non-`_hidden`) diffs for the UI table,
 *   `updates` carries one `updates_<id>` bucket per detector that produced
 *   diffs, and `modified` is true when at least one bucket is non-empty.
 */
async function generatePreviewDiffs(entity, settings) {
  const BATCH_SIZE = 3;

  // A detector runs unless the user switched it off or it needs a resource
  // class that is not enabled. An unset resource flag counts as disabled here.
  const activeDefs = Object.entries(detectors).filter(
    ([id, def]) =>
      !(
        settings?.enabledDetectors?.[id] === false ||
        (def.requiresLargeBuffers && !settings?.enableLargeBuffers) ||
        (def.requiresHeavyComputing && !settings?.enableHeavyComputing)
      ),
  );

  // Detectors registered with `detect: null` are served by detectRefCategories.
  // They are evaluated together in one pass so every reference is classified
  // once, however many of them are active. isRemoveRefCategory is deliberately
  // not the criterion: dupRetrieved sets it too but has its own detect function.
  const sharedIds = activeDefs
    .filter(([, def]) => def.detect === null)
    .map(([id]) => id);
  const sharedRefResults =
    sharedIds.length > 0 ? detectRefCategories(entity, sharedIds) : new Map();

  const runDetector = ([id, def]) =>
    def.detect === null
      ? (sharedRefResults.get(id) ?? [])
      : def.detect(entity);

  // Run the detectors a few at a time, yielding to the event loop between
  // batches. `results` stays index-aligned with `activeDefs`.
  const results = [];
  let cursor = 0;
  while (cursor < activeDefs.length) {
    const next = cursor + BATCH_SIZE;
    const settled = await Promise.all(
      activeDefs.slice(cursor, next).map(runDetector),
    );
    results.push(...settled);
    if (next < activeDefs.length) {
      await new Promise((resume) => setTimeout(resume, 0));
    }
    cursor = next;
  }

  // Bucket the raw diffs per registry id. In dev mode detectAbsorbUrlClaim may
  // also emit ACTION_CONVERT_URL_TO_EXT_ID diffs; those get their own bucket so
  // they are listed under the "convertUrlToExtId" registry entry.
  const rawByDef = new Map();
  for (const [index, [id]] of activeDefs.entries()) {
    const diffs = results[index] || [];
    if (id === "absorbUrlClaim" && DEV_ABSORB_URL_CLAIM_CREATE_MISSING) {
      const absorbDiffs = [];
      const convertDiffs = [];
      for (const diff of diffs) {
        if (diff.action === ACTION_CONVERT_URL_TO_EXT_ID) {
          convertDiffs.push(diff);
        } else {
          absorbDiffs.push(diff);
        }
      }
      rawByDef.set("absorbUrlClaim", absorbDiffs);
      rawByDef.set("convertUrlToExtId", convertDiffs);
      continue;
    }
    // Append to a bucket that already exists (e.g. one created by the split).
    const bucket = rawByDef.get(id);
    rawByDef.set(id, bucket ? bucket.concat(diffs) : diffs);
  }

  // References that an absorb action is about to transplant onto an ext-id
  // claim must not also receive an ext-id snak: the moved reference would then
  // cite the very value of the claim it is attached to.
  const absorbedRefKeys = new Set(
    ["absorbDescribedBySource", "absorbUrlClaim"].flatMap((detectorId) =>
      (rawByDef.get(detectorId) || []).flatMap((diff) =>
        (diff.refHashes || []).map((hash) => `${diff.claimId}::${hash}`),
      ),
    ),
  );
  const addDiffs = rawByDef.get("addExternalIdToReference");
  if (absorbedRefKeys.size && addDiffs?.length) {
    rawByDef.set(
      "addExternalIdToReference",
      addDiffs.filter(
        (diff) => !absorbedRefKeys.has(`${diff.claimId}::${diff.refHash}`),
      ),
    );
  }

  // Walk the buckets rather than activeDefs so that synthetic buckets such as
  // "convertUrlToExtId", which have no detect function of their own, are kept.
  const updates = { claims: [] };
  const previewRows = [];
  let modified = false;
  for (const [id, diffs] of rawByDef) {
    if (!diffs?.length) continue;
    let position = 0;
    for (const diff of diffs) {
      diff.type = id;
      if (!diff.rowId) diff.rowId = `${id}_${position}`;
      // _hidden diffs travel with their visible counterpart but get no row.
      if (!diff._hidden) previewRows.push(diff);
      position += 1;
    }
    updates[`updates_${id}`] = { changes: diffs };
    modified = true;
  }

  console.log(`${TOOL_NAME}: ${previewRows.length} issues found`);
  return { previewRows, updates, modified };
}
/**
 * Composes the Wikidata edit summary for the categories the user ticked.
 *
 * Categories flagged `isRemoveRefCategory` are folded into one
 * "remove a+b refs" segment; every other category contributes its
 * `summaryLabel`, or its id when no label is registered.
 *
 * @param {Object<string, boolean>} checkboxStates - Category id -> ticked.
 * @returns {string} Summary text ending with the link to the tool page.
 */
function buildSummary(checkboxStates) {
  const refLabels = [];
  const otherLabels = [];
  for (const [id, isTicked] of Object.entries(checkboxStates)) {
    if (!isTicked) continue;
    const def = detectors[id];
    if (def?.isRemoveRefCategory) {
      refLabels.push(def.summaryLabel);
    } else {
      otherLabels.push(def?.summaryLabel || id);
    }
  }
  // The combined reference-removal segment always comes first.
  const segments = refLabels.length
    ? [`remove ${refLabels.join("+")} refs`, ...otherLabels]
    : otherLabels;
  return `Cleanup: ${segments.join("; ")} ([[User:Difool/WikidataCleanup]])`;
}
/**
 * Applies the user's selection: gathers the diffs of every ticked category
 * (minus rows that were individually unticked), merges them into one payload
 * and submits it with a single wbeditentity call. The page is reloaded when
 * the API reports success.
 *
 * @param {string} entityId - Id of the entity to edit.
 * @param {object} entity - Entity data the diffs were computed from.
 * @param {object} updates - Holds the `updates_<id>` buckets of diffs.
 * @param {Object<string, boolean>} checkboxStates - Category id -> ticked.
 * @param {Object<string, boolean>} rowStates - Row id -> ticked; only an
 *   explicit `false` excludes a row.
 */
function runCleanupCallback(
  entityId,
  entity,
  updates,
  checkboxStates,
  rowStates,
) {
  const selectedDiffs = Object.keys(detectors)
    .filter((id) => checkboxStates[id])
    .flatMap((id) =>
      (updates[`updates_${id}`]?.changes || []).filter(
        (change) => rowStates[change.rowId] !== false,
      ),
    );
  const filteredUpdates = mergeCleanupDiffs(entity, selectedDiffs);
  const summary = buildSummary(checkboxStates);

  const handleResponse = (resp) => {
    if (resp?.success === 1) {
      window.location.reload();
    } else {
      alert(getMsg("applyFailed"));
    }
  };

  const handleFailure = (errorCode, error) => {
    // Depending on how the rejection is delivered, the API error object can
    // arrive as the first or the second argument; use whichever is an object.
    const errObj = [errorCode, error].find(
      (candidate) => typeof candidate === "object" && candidate,
    );
    const detail = (errObj?.error || errObj)?.info;
    console.error(`${TOOL_NAME}: edit error`, errorCode, error);
    mw.notify(detail || getMsg("cleanupFailed"), {
      type: "error",
      autoHide: false,
      tag: `${TOOL_NAME}_editError`,
    });
  };

  new mw.Api()
    .postWithEditToken({
      action: "wbeditentity",
      format: "json",
      id: entityId,
      summary,
      baserevid: mw.config.get("wgRevisionId"),
      data: JSON.stringify(filteredUpdates),
    })
    .then(handleResponse)
    .catch(handleFailure);
}
/**
 * Renders the cleanup UI, first loading the OOUI modules when they are not
 * available yet. A failed module load is logged and nothing is rendered.
 *
 * All three arguments are forwarded unchanged to renderCleanupUI.
 */
function renderCleanupUISafe(entity, previewRows, applyCleanupChanges) {
  const render = () =>
    renderCleanupUI(entity, previewRows, applyCleanupChanges);
  const isOouiReady = typeof OO !== "undefined" && OO.ui?.ButtonWidget;
  if (isOouiReady) {
    render();
    return;
  }
  mw.loader
    .using(["oojs-ui-core", "oojs-ui-widgets"])
    .then(() => render())
    .catch((err) => console.error(`${TOOL_NAME}: failed to load OOUI`, err));
}
/**
 * Runs one preview pass: removes any previous result box, initialises the
 * caches, fetches the current entity, runs the detectors and renders the
 * result box.
 *
 * The box is rendered when at least one diff was found, or always when the
 * run was started from the menu link (`menuclick`), in which case the box is
 * also scrolled into view and its first button focused.
 *
 * Errors are logged, never thrown: callers start this function without
 * awaiting it, so a rejection here would surface as an unhandled rejection.
 *
 * @param {object} settings - User settings, passed to initCaches and
 *   generatePreviewDiffs.
 * @param {{menuclick?: boolean}} [options]
 * @returns {Promise<void>}
 */
async function initCleanupTool(settings, { menuclick = false } = {}) {
  $("#WikidataCleanup_box").remove();
  try {
    await initCaches(settings);
  } catch (err) {
    // Abort the run: the detectors are not started without initialised caches.
    console.error(`${TOOL_NAME}: cache initialisation failed`, err);
    return;
  }
  try {
    const entityId = mw.config.get("wgPageName");
    const data = await new mw.Api().get({
      action: "wbgetentities",
      format: "json",
      ids: entityId,
      props: "claims|labels|descriptions|aliases|sitelinks",
    });
    const entity = data.entities[entityId];
    if (!entity?.claims) return;
    const { previewRows, updates, modified } = await generatePreviewDiffs(
      entity,
      settings,
    );
    if (!modified && !menuclick) return;
    renderCleanupUISafe(entity, previewRows, (checkboxStates, rowStates) =>
      runCleanupCallback(
        entityId,
        entity,
        updates,
        checkboxStates,
        rowStates,
      ),
    );
    if (menuclick) {
      const box = document.getElementById("WikidataCleanup_box");
      box?.scrollIntoView({ behavior: "smooth", block: "start" });
      box?.querySelector("button")?.focus();
    }
  } catch (err) {
    console.error(`${TOOL_NAME}: API error`, err);
  }
}
// ==== 25 Settings dialog ==================================================
/**
 * Opens the settings dialog: general options, one checkbox per registered
 * detector, and one status/maintenance row per cache.
 *
 * Closing the dialog collects the widget values, hands them to
 * `saveSettings`, and restarts the preview with the new settings.
 *
 * @param {object} currentSettings - Settings used to pre-select the widgets.
 *   Reads `autoStartPreview`, `enableLargeBuffers`, `enableHeavyComputing`
 *   and `enabledDetectors`.
 * @param {function(object): void} saveSettings - Receives the new settings
 *   object when the dialog is closed.
 */
function showSettingsDialog(currentSettings, saveSettings) {
  const autoStartCb = new OO.ui.CheckboxInputWidget({
    selected: !!currentSettings.autoStartPreview,
  });
  // The two resource options and the detector checkboxes render as ticked
  // unless the stored value is explicitly `false`.
  // NOTE(review): an unset resource flag therefore shows as enabled here;
  // confirm that the stored settings always carry explicit defaults.
  const largeBuffersCb = new OO.ui.CheckboxInputWidget({
    selected: currentSettings.enableLargeBuffers !== false,
  });
  const heavyComputingCb = new OO.ui.CheckboxInputWidget({
    selected: currentSettings.enableHeavyComputing !== false,
  });
  const generalFieldset = new OO.ui.FieldsetLayout({
    label: getMsg("generalSettings"),
    items: [
      new OO.ui.FieldLayout(autoStartCb, {
        label: getMsg("autoStart"),
        align: "inline",
      }),
      new OO.ui.FieldLayout(largeBuffersCb, {
        label: "Enable large buffers (IndexedDB caching)",
        help: "Disabling reduces memory usage but disables some advanced features",
        align: "inline",
      }),
      new OO.ui.FieldLayout(heavyComputingCb, {
        label: "Enable heavy computing (complex analysis)",
        help: "Disabling reduces CPU usage but disables some analysis features",
        align: "inline",
      }),
    ],
  });

  const detectorFields = Object.entries(detectors).map(([id, def]) => {
    const cb = new OO.ui.CheckboxInputWidget({
      selected: currentSettings.enabledDetectors?.[id] !== false,
    });
    // Tag the widget so the detector id can be recovered when saving.
    cb.detectorId = id;
    return new OO.ui.FieldLayout(cb, {
      label: getMsg(def.label),
      align: "inline",
    });
  });
  const detectorFieldset = new OO.ui.FieldsetLayout({
    label: getMsg("detectorSettings"),
    items: detectorFields,
  });

  /**
   * Builds one cache row: a status label plus maintenance buttons.
   * Caches with a `fetchFn` get "Reload" and "Clear cache"; caches without
   * one only get a destructive "Reset now".
   */
  function buildCacheField(cacheDef) {
    const info = new OO.ui.LabelWidget({ label: "Loading…" });
    cache_getStatus(cacheDef).then((s) => info.setLabel(s));
    const refreshStatus = async () => {
      info.setLabel(await cache_getStatus(cacheDef));
    };
    const reloadBtn = new OO.ui.ButtonWidget({
      label: cacheDef.fetchFn ? "Reload" : "Reset now",
      flags: [cacheDef.fetchFn ? "progressive" : "destructive"],
    });
    reloadBtn.on("click", async () => {
      if (cacheDef.fetchFn)
        await refreshCacheWithNotify(cacheDef, currentSettings);
      else await cache_reset(cacheDef);
      await refreshStatus();
    });
    const clearBtn = new OO.ui.ButtonWidget({
      label: "Clear cache",
      flags: ["destructive"],
    });
    clearBtn.on("click", async () => {
      await cache_reset(cacheDef);
      await refreshStatus();
    });
    const items = [info, reloadBtn];
    if (cacheDef.fetchFn) items.push(clearBtn); // only show when reload is available
    return new OO.ui.PanelLayout({
      padded: true,
      expanded: false,
      content: [new OO.ui.HorizontalLayout({ items })],
    });
  }
  const cacheFieldset = new OO.ui.FieldsetLayout({
    label: getMsg("cacheSettings"),
    items: caches.map(buildCacheField),
  });

  // Dialog class (scoped to avoid polluting the outer IIFE)
  const SettingsDialog = function (config) {
    SettingsDialog.super.call(this, config);
  };
  OO.inheritClass(SettingsDialog, OO.ui.ProcessDialog);
  SettingsDialog.static.name = "settingsDialog";
  SettingsDialog.static.title = getMsg("settingsTitle");
  SettingsDialog.static.size = "large";
  SettingsDialog.static.actions = [
    { action: "close", label: getMsg("close"), flags: ["safe", "primary"] },
  ];
  SettingsDialog.prototype.initialize = function () {
    SettingsDialog.super.prototype.initialize.apply(this, arguments);
    const content = new OO.ui.PanelLayout({ padded: true, expanded: true });
    content.$element.append(
      generalFieldset.$element,
      detectorFieldset.$element,
      cacheFieldset.$element,
    );
    this.$body.append(content.$element);
  };
  SettingsDialog.prototype.getActionProcess = function (action) {
    if (action !== "close")
      return SettingsDialog.super.prototype.getActionProcess.call(
        this,
        action,
      );
    // "close" doubles as "save": persist the widget state and re-run the preview.
    return new OO.ui.Process(() => {
      const newSettings = {
        autoStartPreview: autoStartCb.isSelected(),
        enableLargeBuffers: largeBuffersCb.isSelected(),
        enableHeavyComputing: heavyComputingCb.isSelected(),
        enabledDetectors: Object.fromEntries(
          detectorFields.map((f) => [
            f.fieldWidget.detectorId,
            f.fieldWidget.isSelected(),
          ]),
        ),
      };
      saveSettings(newSettings);
      $("#WikidataCleanup_box").remove();
      initCleanupTool(newSettings);
      this.close({ action: "close" });
    });
  };

  const manager = new OO.ui.WindowManager();
  $("body").append(manager.$element);
  const dialog = new SettingsDialog();
  manager.addWindows([dialog]);
  // A fresh window manager is created on every call; destroy it once the
  // dialog has closed so its element and global listeners do not accumulate
  // in the page each time the settings are opened.
  const disposeManager = () => manager.destroy();
  manager.openWindow(dialog).closed.then(disposeManager, disposeManager);
}
// ==== 26 Entry point ======================================================
/**
 * Adds the launcher that starts a preview run on demand.
 *
 * On every skin except Minerva a portlet link is added to the "p-cactions"
 * menu. On Minerva, or when the portlet link could not be created, a button
 * is appended to the first of `.page-actions-menu`, `.minerva-header` or
 * `.content` that exists. Nothing is added when none of them is found.
 */
function addCleanupLink() {
  const label = getMsg("startPreview");
  const clickHandler = (event) => {
    // The portlet link points at "#": without this the browser would jump to
    // the top of the page and append "#" to the URL before the preview runs.
    event.preventDefault();
    initCleanupTool(cache_loadSettings(), { menuclick: true });
  };
  if (mw.config.get("skin") !== "minerva") {
    const link = mw.util.addPortletLink(
      "p-cactions",
      "#",
      label,
      `${TOOL_NAME}_link`,
      getMsg("runPreview"),
    );
    if (link) {
      link.addEventListener("click", clickHandler);
      return;
    }
  }
  // Fallback for MinervaNeue
  const header =
    document.querySelector(".page-actions-menu") ||
    document.querySelector(".minerva-header") ||
    document.querySelector(".content");
  if (header) {
    const btn = document.createElement("button");
    // Explicit type so the button can never act as a form submit button.
    btn.type = "button";
    btn.className =
      "cdx-button cdx-button--action-progressive wikidata-cleanup-launcher";
    btn.textContent = label;
    btn.addEventListener("click", clickHandler);
    header.appendChild(btn);
  }
}
/**
 * Starts the gadget: injects the table styling, publishes the console debug
 * helpers on `window.wd_cleanup_debug`, adds the launcher link and, when the
 * stored settings ask for it, starts a preview run straight away.
 */
function init() {
  mw.util.addCSS(`
#WikidataCleanup_box table td:first-child,
#WikidataCleanup_box table th:first-child {
text-align: center;
padding-left: 0.25em;
padding-right: 0.25em;
}
`);

  // Lists the cache entries accepted by `matches` as { pid, <valueKey> } records.
  const searchCache = (cache, valueKey, matches) =>
    Array.from(cache.entries())
      .filter((entry) => matches(entry))
      .map(([pid, value]) => ({ pid, [valueKey]: value }));

  // Debug helpers (available in browser console). Each helper reads the
  // caches at call time, so results reflect their current content.
  const debugHelpers = {
    getObsoleteIdProps() {
      return Array.from(obsoleteIdProps);
    },
    getPropertyRegex(pid) {
      return propertyRegexCache.get(pid);
    },
    getPropertyUrlPatterns(pid) {
      return propertyUrlPatternsCache.get(pid);
    },
    getPropertyStatedIn(pid) {
      return propertyStatedInCache.get(pid);
    },
    getAllCaches() {
      // Copies, so console experiments cannot modify the live caches.
      return {
        obsoleteIdProps: Array.from(obsoleteIdProps),
        propertyRegexCache: new Map(propertyRegexCache),
        propertyUrlPatternsCache: new Map(propertyUrlPatternsCache),
        propertyStatedInCache: new Map(propertyStatedInCache),
      };
    },
    searchRegexCache(query) {
      return searchCache(
        propertyRegexCache,
        "regex",
        ([pid, regex]) => pid.includes(query) || regex.includes(query),
      );
    },
    searchUrlPatterns(query) {
      return searchCache(propertyUrlPatternsCache, "patterns", ([pid]) =>
        pid.includes(query),
      );
    },
    testUrlAgainstPatterns(url) {
      return matchUrlAgainstPatterns(url);
    },
  };
  window.wd_cleanup_debug = debugHelpers;

  addCleanupLink();
  const settings = cache_loadSettings();
  if (settings.autoStartPreview) {
    initCleanupTool(settings);
  }
}
// Start the gadget once the modules it needs are loaded. A failed load, or an
// exception thrown during startup, is logged instead of being left as an
// unhandled rejection.
mw.loader
  .using(["mediawiki.util", "oojs-ui-core", "oojs-ui-widgets"])
  .then(init)
  .catch((err) => console.error(`${TOOL_NAME}: startup failed`, err));
})(mediaWiki);
// </nowiki>