#! /usr/bin/env node
const fs = require("fs");
const chalk = require("chalk");
// Load the input file and pull out its top-level `leads` collection, which the
// rest of the script iterates over.
const leadsFileText = fs.readFileSync("./leads.json", "utf8");
const { leads } = JSON.parse(leadsFileText);
// Surviving records keyed by _id. The index has no prototype so that an _id
// such as "constructor" or "__proto__" is stored and looked up as plain data.
const leadsById = Object.create(null);
// _id of the surviving record for each email (prototype-free for the same reason).
const leadIdssByEmail = Object.create(null);
// One entry per detected duplicate: { left, right, collidingField, took }.
// `left` is the record already stored, `right` is the record being processed.
const collisions = [];

// Deduplicate leads. Two records are duplicates when they share an email or an
// _id (email is checked first). The record with the later entryDate survives;
// when the stored record is not strictly newer - equal dates, or a date that
// cannot be parsed - the record appearing later in `leads` survives.
// Every losing record is kept in the survivor's `discardedValues`, with the
// newly discarded record first.
for (const currentLead of leads) {
  const { _id, email } = currentLead;

  // Compare against undefined rather than relying on truthiness so that a
  // stored _id of 0 or "" still counts as an email match.
  const collidingLeadIdByEmail = leadIdssByEmail[email];
  const collidesByEmail = collidingLeadIdByEmail !== undefined;
  const collidingLead = collidesByEmail
    ? leadsById[collidingLeadIdByEmail]
    : leadsById[_id];

  if (!collidingLead) {
    // no collision - index the record under both keys
    leadsById[_id] = currentLead;
    leadIdssByEmail[email] = _id;
    continue;
  }

  const collision = {
    left: collidingLead,
    right: currentLead,
    collidingField: collidesByEmail ? "email" : "_id",
  };
  collisions.push(collision);

  const existingIsNewer =
    new Date(collidingLead.entryDate) > new Date(currentLead.entryDate);
  collision.took = existingIsNewer ? "left" : "right";
  const winner = existingIsNewer ? collidingLead : currentLead;
  const loser = existingIsNewer ? currentLead : collidingLead;

  // Split each record from its history without mutating the originals, so
  // that discarded entries never carry a nested discardedValues list.
  const { discardedValues: winnerHistory, ...winnerFields } = winner;
  const { discardedValues: loserHistory, ...loserFields } = loser;

  // Keep BOTH histories: the winner's earlier discards must not be dropped
  // when it wins again against a later duplicate.
  const lead = {
    ...winnerFields,
    discardedValues: [
      loserFields,
      ...(loserHistory || []),
      ...(winnerHistory || []),
    ],
  };

  if (existingIsNewer) {
    // The stored record keeps its _id and email, so only its entry in the ID
    // index needs refreshing; the email index already points at it.
    leadsById[lead._id] = lead;
    continue;
  }

  // The current record replaces the stored one: drop whichever index entry of
  // the stored record is not reused by the current record's own keys.
  if (collision.collidingField === "_id") {
    // same _id, new email - forget the old email
    delete leadIdssByEmail[collidingLead.email];
  } else {
    // same email, possibly a new _id - forget the old _id
    delete leadsById[collidingLead._id];
  }
  // NOTE(review): when the match was by email and a *different* record is
  // already stored under this _id, the assignment below overwrites that record
  // without adding it to discardedValues or to `collisions` - confirm whether
  // such double collisions can occur in the input.
  leadIdssByEmail[email] = _id;
  leadsById[_id] = lead;
}
// Builds a callback for one property: given an object, the callback prints
// that object's value for `prop` on a red background when it is not strictly
// equal to `val`. It returns false when the values match and otherwise
// whatever console.log returns.
const printPropRedIfDiff = (prop, val) => (object) => {
  const isSame = object[prop] === val;
  if (isSame) {
    return false;
  }
  return console.log("\t\t", chalk.bgRed(object[prop]));
};
// Prints one lead to stdout: a separator, then each known field on its own
// line. When the lead carries `discardedValues`, every discarded record's
// differing value for that field is printed (highlighted) right below it.
const prettyPrintItem = (lead) => {
  const { discardedValues } = lead;
  // [label, property] pairs in output order; tab counts keep the values aligned
  const rows = [
    ["_id:\t\t", "_id"],
    ["email:\t\t", "email"],
    ["firstName:\t", "firstName"],
    ["lastName:\t", "lastName"],
    ["address:\t", "address"],
    ["entryDate:\t", "entryDate"],
  ];

  console.log("------");
  for (const [label, prop] of rows) {
    const value = lead[prop];
    console.log(label, value);
    if (discardedValues) {
      discardedValues.forEach(printPropRedIfDiff(prop, value));
    }
  }
  console.log();
};
// Show every surviving lead on stdout, write the same list to disk as
// pretty-printed JSON, then print a short summary of the run.
const survivingLeads = Object.values(leadsById);
for (const [position, survivor] of survivingLeads.entries()) {
  prettyPrintItem(survivor, position, survivingLeads);
}

const serializedLeads = JSON.stringify(survivingLeads, null, 2);
fs.writeFileSync("./deduplicatedLeads.json", serializedLeads);

console.log("records processed:", leads.length);
console.log("collisions:", collisions.length);
console.log("output leads:", survivingLeads.length);
console.log("leads written to deduplicatedLeads.json");
// uncomment for more information about deduplication
// console.log("collisions", collisions);