125 lines
3.9 KiB
JavaScript
Executable File
125 lines
3.9 KiB
JavaScript
Executable File
#! /usr/bin/env node
|
|
const fs = require("fs");
|
|
const chalk = require("chalk");
|
|
|
|
// Load the input file and pull the `leads` property out of the parsed JSON.
// The path is relative, so Node resolves it against the current working
// directory of the process.
const leadsFileContents = fs.readFileSync("./leads.json");
const { leads } = JSON.parse(leadsFileContents);
|
|
|
|
/**
 * Separates a lead from the discard history it carries.
 *
 * Works on copies, so neither the input record nor an already indexed lead is
 * mutated.
 *
 * @param {object} lead - a lead, optionally carrying `discardedValues`
 * @returns {{record: object, history: Array}} the lead without
 *   `discardedValues`, plus that history (empty when the lead has none)
 */
function splitDiscardHistory(lead) {
  const { discardedValues, ...record } = lead;
  return { record, history: discardedValues || [] };
}

/**
 * Deduplicates leads so that every `_id` and every `email` appears only once.
 *
 * Leads are processed in order. A lead collides with a stored lead that has
 * the same email and/or a stored lead that has the same `_id`; these can be
 * two different stored leads. The current lead replaces all of its rivals
 * unless one of them has a strictly newer `entryDate`; on a tie (or an
 * unparseable date) the current lead wins. Whatever is discarded is kept,
 * most recent first, in the surviving lead's `discardedValues`.
 *
 * One entry is pushed to `collisions` per rival, so a lead that collides on
 * both fields with two different stored leads contributes two entries.
 *
 * @param {Iterable<object>} leads - records with `_id`, `email`, `entryDate`
 * @returns {{leadsById: object, leadIdsByEmail: object, collisions: Array}}
 *   surviving leads keyed by `_id`, the email -> `_id` index, and a log of
 *   every collision (`left` = stored lead, `right` = incoming lead,
 *   `collidingField`, `took`)
 */
function deduplicateLeads(leads) {
  // Null-prototype objects: inherited keys such as "constructor" can never be
  // mistaken for an indexed lead.
  // index records by ID for easy lookup
  const leadsById = Object.create(null);
  // index IDs by email for easy lookup
  const leadIdsByEmail = Object.create(null);
  const collisions = [];

  for (const currentLead of leads) {
    const { _id, email } = currentLead;

    // Collect every distinct stored lead that clashes with the current one.
    // Compare against undefined rather than truthiness so that an indexed
    // _id of 0 or "" still counts as a collision.
    const rivals = [];
    const emailOwnerId = leadIdsByEmail[email];
    if (emailOwnerId !== undefined && leadsById[emailOwnerId] !== undefined) {
      rivals.push({ lead: leadsById[emailOwnerId], collidingField: "email" });
    }
    const idOwner = leadsById[_id];
    if (
      idOwner !== undefined &&
      (rivals.length === 0 || rivals[0].lead !== idOwner)
    ) {
      rivals.push({ lead: idOwner, collidingField: "_id" });
    }

    if (rivals.length === 0) {
      // no collision
      leadsById[_id] = currentLead;
      leadIdsByEmail[email] = _id;
      continue;
    }

    const currentDate = new Date(currentLead.entryDate);
    // First stored rival that is strictly newer than the current lead, if any.
    const newerRival = rivals.find(
      ({ lead }) => new Date(lead.entryDate) > currentDate
    );
    const took = newerRival === undefined ? "right" : "left";
    const current = splitDiscardHistory(currentLead);

    for (const rival of rivals) {
      // Both sides are logged without their discard history to keep the
      // collision log flat.
      collisions.push({
        left: splitDiscardHistory(rival.lead).record,
        right: current.record,
        collidingField: rival.collidingField,
        took,
      });
    }

    if (newerRival !== undefined) {
      // A stored lead is newer: discard the current lead, but remember it.
      // The stored lead's earlier history is preserved behind the new entry
      // instead of being overwritten.
      const kept = splitDiscardHistory(newerRival.lead);
      leadsById[newerRival.lead._id] = {
        ...kept.record,
        discardedValues: [current.record, ...current.history, ...kept.history],
      };
      continue;
    }

    // The current lead is at least as new as every rival: it replaces them all.
    const discardedValues = [];
    for (const { lead, collidingField } of rivals) {
      const loser = splitDiscardHistory(lead);
      discardedValues.push(loser.record, ...loser.history);
      // Drop the loser's email entry so the email index never points at a
      // lead that no longer owns that email.
      delete leadIdsByEmail[lead.email];
      if (collidingField === "email") {
        // An email rival lives under its own ID; remove that entry. An _id
        // rival is simply overwritten in place below.
        delete leadsById[lead._id];
      }
    }
    discardedValues.push(...current.history);

    leadsById[_id] = { ...current.record, discardedValues };
    leadIdsByEmail[email] = _id;
  }

  return { leadsById, leadIdsByEmail, collisions };
}

// deduplicate leads
// (the email index keeps its original top-level name, spelling included)
const {
  leadsById,
  leadIdsByEmail: leadIdssByEmail,
  collisions,
} = deduplicateLeads(leads);
|
|
|
|
/**
 * Builds a callback that prints an object's `prop` on a red background when
 * it is not strictly equal to `val`.
 *
 * @param {string} prop - property to read from each object
 * @param {*} val - reference value to compare against
 * @returns {function(object): (boolean|undefined)} callback that returns
 *   `false` when the values match and otherwise the result of `console.log`
 */
const printPropRedIfDiff = (prop, val) => {
  return (object) => {
    const candidate = object[prop];
    if (candidate === val) {
      return false;
    }
    return console.log("\t\t", chalk.bgRed(candidate));
  };
};
|
|
|
|
/**
 * Prints one lead to stdout: a separator line, then each field on its own
 * line, then a blank line. After each field, the same field of every entry
 * in `discardedValues` is passed through `printPropRedIfDiff`, so values that
 * differ from the kept one are printed beneath it.
 *
 * @param {object} lead - lead to print; `discardedValues` is optional
 */
const prettyPrintItem = ({
  _id,
  email,
  firstName,
  lastName,
  address,
  entryDate,
  discardedValues,
}) => {
  // [label, property name, value] in print order; each label carries its own
  // trailing tabs.
  const rows = [
    ["_id:\t\t", "_id", _id],
    ["email:\t\t", "email", email],
    ["firstName:\t", "firstName", firstName],
    ["lastName:\t", "lastName", lastName],
    ["address:\t", "address", address],
    ["entryDate:\t", "entryDate", entryDate],
  ];

  console.log("------");
  for (const [label, prop, value] of rows) {
    console.log(label, value);
    if (discardedValues) {
      discardedValues.forEach(printPropRedIfDiff(prop, value));
    }
  }
  console.log();
};
|
|
|
|
// Surviving leads, in the iteration order of the ID index.
const dedupedLeads = Object.values(leadsById);

// Show every surviving lead on stdout.
dedupedLeads.forEach(prettyPrintItem);

// Persist the surviving leads as pretty-printed JSON (2-space indent).
const dedupedLeadsJson = JSON.stringify(dedupedLeads, null, 2);
fs.writeFileSync("./deduplicatedLeads.json", dedupedLeadsJson);

// Summary of the run.
console.log("records processed:", leads.length);
console.log("collisions:", collisions.length);
console.log("output leads:", dedupedLeads.length);
console.log("leads written to deduplicatedLeads.json");

// // uncomment for more information about deduplication
// console.log("collisions", collisions);
|