Store discarded values on leads and write to file

This commit is contained in:
Chuck Dries 2021-08-09 12:43:13 -07:00
parent d4ce905d1a
commit 7453de7257
2 changed files with 33 additions and 8 deletions

3
.gitignore vendored
View File

@@ -1 +1,2 @@
.DS_Store .DS_Store
deduplicatedLeads.json

View File

@@ -10,6 +10,7 @@ const collisions = [];
for (const currentLead of leads) { for (const currentLead of leads) {
const { _id, email } = currentLead; const { _id, email } = currentLead;
const collidingLeadIdByEmail = leadIdssByEmail[email]; const collidingLeadIdByEmail = leadIdssByEmail[email];
const collidingLead = collidingLeadIdByEmail const collidingLead = collidingLeadIdByEmail
? leadsById[collidingLeadIdByEmail] ? leadsById[collidingLeadIdByEmail]
@@ -28,20 +29,39 @@ for (const currentLead of leads) {
// existing lead is newer than current lead // existing lead is newer than current lead
// discard current lead by doing nothing with it // discard current lead by doing nothing with it
collision.took = "left"; collision.took = "left";
const discardedValues = [...(collision.right.discardedValues || [])];
delete collision.right.discardedValues;
discardedValues.push(collision.right);
const lead = {
...collidingLead,
discardedValues,
};
leadsById[lead._id] = lead;
} else { } else {
// current lead is newer than existing lead, or both leads have the same date // current lead is newer than existing lead, or both leads have the same date
// either way, take the current lead over the existing one // either way, take the current lead over the existing one
collision.took = "right"; collision.took = "right";
const discardedValues = [...(collision.left.discardedValues || [])];
delete collision.left.discardedValues;
discardedValues.push(collision.left);
const lead = {
...currentLead,
discardedValues,
};
// rewrite indices by which field collides
if (collision.collidingField === "_id") { if (collision.collidingField === "_id") {
// colliding ID - replace ID index, delete old email in email index // colliding ID - replace ID index, delete old email in email index
delete leadIdssByEmail[collision.left.email]; delete leadIdssByEmail[collision.left.email];
leadIdssByEmail[email] = _id; leadIdssByEmail[email] = _id;
leadsById[_id] = currentLead; leadsById[_id] = lead;
} else { } else {
// colliding email - replace ID in email index, delete old ID index // colliding email - replace ID in email index, delete old ID index
leadIdssByEmail[email] = _id; leadIdssByEmail[email] = _id;
delete leadsById[collision.left._id]; delete leadsById[collision.left._id];
leadsById[_id] = currentLead; leadsById[_id] = lead;
} }
} }
} else { } else {
@@ -51,9 +71,13 @@ for (const currentLead of leads) {
} }
} }
console.log("collisions", collisions); fs.writeFileSync(
console.log("leadsById", leadsById); "./deduplicatedLeads.json",
JSON.stringify(Object.values(leadsById), null, 2)
);
console.log('records processed:', leads.length) console.log("records processed:", leads.length);
console.log('collisions:', collisions.length) console.log("collisions:", collisions.length);
console.log('output leads:', Object.keys(leadsById).length) console.log("output leads:", Object.keys(leadsById).length);
console.log("collisions", collisions);