85 lines
2.6 KiB
JavaScript
Executable File
85 lines
2.6 KiB
JavaScript
Executable File
#! /usr/bin/env node
|
|
const fs = require("fs");
|
|
/**
 * Build a copy of `kept` whose `discardedValues` history additionally holds
 * `dropped` and everything `dropped` had itself displaced earlier.
 *
 * Neither argument is mutated. Each history entry is stored without its own
 * `discardedValues`, so the history stays a flat list.
 *
 * @param {object} kept - the lead that survives the collision
 * @param {object} dropped - the lead that loses the collision
 * @returns {object} a new lead object with the merged history
 */
function absorbLead(kept, dropped) {
  const { discardedValues: keptHistory, ...keptFields } = kept;
  const { discardedValues: droppedHistory, ...droppedFields } = dropped;
  return {
    ...keptFields,
    discardedValues: [
      ...(keptHistory || []),
      ...(droppedHistory || []),
      droppedFields,
    ],
  };
}

/**
 * Append a collision entry to `collisions` and report which side wins.
 *
 * The existing lead ("left") wins only when its `entryDate` is strictly later.
 * On equal dates, or when either date cannot be parsed (the comparison is then
 * false), the incoming lead ("right") wins.
 *
 * @param {object[]} collisions - log that receives the new entry
 * @param {object} existing - lead already held in the indices
 * @param {object} incoming - lead currently being processed
 * @param {"email"|"_id"} collidingField - field on which the two leads clash
 * @returns {"left"|"right"} the side that is kept
 */
function logCollision(collisions, existing, incoming, collidingField) {
  const existingIsNewer =
    new Date(existing.entryDate) > new Date(incoming.entryDate);
  const collision = {
    left: existing,
    right: incoming,
    collidingField,
    took: existingIsNewer ? "left" : "right",
  };
  collisions.push(collision);
  return collision.took;
}

/**
 * Deduplicate leads so that no two surviving leads share an `_id` or an
 * `email`. Input records are never mutated.
 *
 * Leads are processed in input order. An email clash is checked before an
 * `_id` clash. If the incoming lead wins an email clash and its `_id` also
 * belongs to a different stored lead, that second clash is resolved and logged
 * as well, so no stored lead is overwritten without being recorded.
 *
 * @param {object[]} leads - records carrying `_id`, `email` and `entryDate`
 * @returns {{deduplicatedLeads: object[], collisions: object[]}}
 */
function deduplicateLeads(leads) {
  // Null-prototype dictionaries: a key such as "constructor" must not resolve
  // to something inherited from Object.prototype.
  const leadsById = Object.create(null); // _id -> surviving lead
  const leadIdsByEmail = Object.create(null); // email -> _id of surviving lead
  const collisions = [];

  for (const currentLead of leads) {
    const { _id, email } = currentLead;
    let survivor = currentLead;

    // `in` rather than a truthiness test, so a falsy stored id (0, "") still
    // counts as a hit.
    if (email in leadIdsByEmail) {
      const existingId = leadIdsByEmail[email];
      const existing = leadsById[existingId];
      if (logCollision(collisions, existing, survivor, "email") === "left") {
        // Existing lead is newer: keep it in place and remember the loser.
        leadsById[existingId] = absorbLead(existing, survivor);
        continue;
      }
      // Incoming lead wins: retire the old record from both indices.
      delete leadsById[existingId];
      delete leadIdsByEmail[email];
      survivor = absorbLead(survivor, existing);
    }

    if (_id in leadsById) {
      const existing = leadsById[_id];
      if (logCollision(collisions, existing, survivor, "_id") === "left") {
        leadsById[_id] = absorbLead(existing, survivor);
        continue;
      }
      // The old record's email no longer belongs to any stored lead.
      delete leadIdsByEmail[existing.email];
      survivor = absorbLead(survivor, existing);
    }

    leadsById[_id] = survivor;
    leadIdsByEmail[email] = _id;
  }

  return { deduplicatedLeads: Object.values(leadsById), collisions };
}

// Script entry: read ./leads.json, deduplicate, write the result, print a summary.
const { leads } = JSON.parse(fs.readFileSync("./leads.json", "utf8"));

const { deduplicatedLeads, collisions } = deduplicateLeads(leads);

fs.writeFileSync(
  "./deduplicatedLeads.json",
  JSON.stringify(deduplicatedLeads, null, 2)
);

console.log("records processed:", leads.length);
console.log("collisions:", collisions.length);
console.log("output leads:", deduplicatedLeads.length);

console.log("collisions", collisions);