#! /usr/bin/env node

/**
 * Deduplicates the leads stored in ./leads.json.
 *
 * Two leads are duplicates when they share an `_id` or an `email`. The lead
 * with the later `entryDate` wins; on a tie (or when a date cannot be
 * compared) the record that appears later in the input wins. Every losing
 * record is kept, newest discard first, in the winner's `discardedValues`.
 *
 * The surviving leads are pretty-printed to stdout and written to
 * ./deduplicatedLeads.json.
 */

const fs = require("fs");
const chalk = require("chalk");

const { leads } = JSON.parse(fs.readFileSync("./leads.json", "utf8"));
if (!Array.isArray(leads)) {
  throw new Error('./leads.json must contain a top-level "leads" array');
}

// index records by ID for easy lookup
// (null-prototype, so an ID such as "constructor" is ordinary data and never
// matches an inherited Object.prototype member)
const leadsById = Object.create(null);
// index IDs by email for easy lookup
const leadIdsByEmail = Object.create(null);
// one entry per resolved collision: { left, right, collidingField, took }
const collisions = [];

/**
 * Builds the record that survives a collision.
 *
 * @param {object} winner - lead that is kept
 * @param {object} loser - lead that is discarded
 * @returns {object} copy of `winner` whose `discardedValues` lists the loser
 *   first, then the loser's own history, then the winner's earlier history.
 *   Neither argument is mutated.
 */
const mergeLeads = (winner, loser) => {
  const { discardedValues: winnerHistory, ...winnerRecord } = winner;
  const { discardedValues: loserHistory, ...loserRecord } = loser;
  return {
    ...winnerRecord,
    discardedValues: [
      loserRecord,
      ...(loserHistory || []),
      // keep what the winner had already displaced; dropping this would
      // lose every earlier discard whenever the stored lead wins again
      ...(winnerHistory || []),
    ],
  };
};

/**
 * Logs a collision and decides which side is kept.
 *
 * @param {object} left - lead already stored in the indices
 * @param {object} right - lead currently being added
 * @param {"email"|"_id"} collidingField - field the two leads share
 * @returns {"left"|"right"} "left" only when the stored lead is strictly
 *   newer; ties and invalid dates compare as false and therefore yield "right"
 */
const recordCollision = (left, right, collidingField) => {
  const leftIsNewer = new Date(left.entryDate) > new Date(right.entryDate);
  const took = leftIsNewer ? "left" : "right";
  collisions.push({ left, right, collidingField, took });
  return took;
};

/**
 * Adds one input record to the indices, resolving any collisions.
 *
 * A record can collide with one stored lead by email and with a different
 * stored lead by _id, so both lookups are resolved in turn (email first).
 *
 * @param {object} incoming - record read from the input file
 */
const addLead = (incoming) => {
  let candidate = incoming;

  const idForEmail = leadIdsByEmail[candidate.email];
  const emailMatch =
    idForEmail === undefined ? undefined : leadsById[idForEmail];
  if (emailMatch) {
    if (recordCollision(emailMatch, candidate, "email") === "left") {
      // stored lead is newer: keep it where it is and remember the loser
      leadsById[emailMatch._id] = mergeLeads(emailMatch, candidate);
      return;
    }
    // incoming lead is newer or the same age: evict the stored lead
    candidate = mergeLeads(candidate, emailMatch);
    delete leadsById[emailMatch._id];
    delete leadIdsByEmail[emailMatch.email];
  }

  const idMatch = leadsById[candidate._id];
  if (idMatch) {
    if (recordCollision(idMatch, candidate, "_id") === "left") {
      leadsById[idMatch._id] = mergeLeads(idMatch, candidate);
      return;
    }
    candidate = mergeLeads(candidate, idMatch);
    // the ID slot is overwritten below; only the old email entry must go
    delete leadIdsByEmail[idMatch.email];
  }

  leadsById[candidate._id] = candidate;
  leadIdsByEmail[candidate.email] = candidate._id;
};

// deduplicate leads
for (const currentLead of leads) {
  addLead(currentLead);
}

// fields shown for each lead, paired with their tab-aligned labels
const PRINTED_FIELDS = [
  ["_id", "_id:\t\t"],
  ["email", "email:\t\t"],
  ["firstName", "firstName:\t"],
  ["lastName", "lastName:\t"],
  ["address", "address:\t"],
  ["entryDate", "entryDate:\t"],
];

/**
 * Creates a printer that shows a discarded record's `prop` on a red
 * background, but only when it differs from the surviving value `val`.
 *
 * @param {string} prop - property to compare
 * @param {*} val - value of `prop` on the surviving lead
 * @returns {function(object): void}
 */
const printPropRedIfDiff = (prop, val) => (object) => {
  if (object[prop] !== val) {
    console.log("\t\t", chalk.bgRed(object[prop]));
  }
};

/**
 * Prints one surviving lead; under each field, the differing values of the
 * records it displaced are listed in red.
 *
 * @param {object} lead - deduplicated lead, optionally with `discardedValues`
 */
const prettyPrintItem = (lead) => {
  const discarded = lead.discardedValues || [];
  console.log("------");
  for (const [prop, label] of PRINTED_FIELDS) {
    console.log(label, lead[prop]);
    discarded.forEach(printPropRedIfDiff(prop, lead[prop]));
  }
  console.log();
};

const dedupedLeads = Object.values(leadsById);
dedupedLeads.forEach(prettyPrintItem);

fs.writeFileSync(
  "./deduplicatedLeads.json",
  JSON.stringify(dedupedLeads, null, 2)
);

console.log("records processed:", leads.length);
console.log("collisions:", collisions.length);
console.log("output leads:", dedupedLeads.length);
console.log("leads written to deduplicatedLeads.json");

// uncomment for more information about deduplication
// console.log("collisions", collisions);