/**
 * Lead de-duplication script (index.js).
 *
 * Reads `./leads.json` (shape: `{ "leads": [ ... ] }`), removes records that
 * share an `_id` or an `email` with another record, and prints the collisions
 * that were resolved together with the surviving leads.
 */

/**
 * De-duplicate lead records by `_id` and by `email`.
 *
 * Records are processed in input order. A record collides with an already
 * retained record when it has the same `_id` or the same `email`; a single
 * record may collide with up to two different retained records (one per
 * field). The incoming record is kept only if no colliding record is strictly
 * newer (by `entryDate`); on a tie the incoming (later in the input) record
 * wins. When the incoming record wins, every record it collided with is
 * evicted from both indexes, so no stale index entry is left behind.
 *
 * Unparseable `entryDate` values compare as "not newer", so a record with an
 * invalid date never beats the incoming record on date alone.
 *
 * @param {Array<object>} leads - Records with `_id`, `email` and `entryDate`.
 * @returns {{ leadsById: Object<string, object>, collisions: Array<object> }}
 *   `leadsById` maps each surviving `_id` to its record. `collisions` lists
 *   every detected collision as `{ left, right, collidingField, took }`, where
 *   `left` is the previously retained record, `right` the incoming one,
 *   `collidingField` is `"email"` or `"_id"`, and `took` names the side that
 *   was retained (`"left"` or `"right"`).
 * @throws {TypeError} If `leads` is not an array.
 */
function dedupeLeads(leads) {
  if (!Array.isArray(leads)) {
    throw new TypeError("leads must be an array of lead records");
  }

  // Prototype-less dictionaries: keys come straight from the input data, so
  // values such as "__proto__" or "constructor" must behave like plain keys.
  const leadsById = Object.create(null);
  const leadIdsByEmail = Object.create(null);
  const collisions = [];

  for (const currentLead of leads) {
    const { _id, email } = currentLead;

    // Look up BOTH possible collisions. Checking only one of them lets a
    // record that matches lead A by email and lead B by _id silently
    // overwrite B and leave B's email pointing at the wrong record.
    const emailRival =
      email in leadIdsByEmail ? leadsById[leadIdsByEmail[email]] : undefined;
    const idRival = _id in leadsById ? leadsById[_id] : undefined;

    const found = [];
    if (emailRival !== undefined) {
      found.push({
        left: emailRival,
        right: currentLead,
        collidingField: "email",
      });
    }
    // When the same retained record matches on both fields, report it once
    // (as an "email" collision).
    if (idRival !== undefined && idRival !== emailRival) {
      found.push({ left: idRival, right: currentLead, collidingField: "_id" });
    }

    // The incoming record survives only if no rival is strictly newer.
    // With no rivals at all this is trivially true.
    const currentDate = new Date(currentLead.entryDate);
    const currentWins = found.every(
      ({ left }) => !(new Date(left.entryDate) > currentDate)
    );

    for (const collision of found) {
      // `took` records which side is retained. If the incoming record loses
      // to any rival it is discarded, so every rival stays ("left").
      collision.took = currentWins ? "right" : "left";
      collisions.push(collision);
    }

    if (!currentWins) {
      continue;
    }

    // Evict every loser from both indexes before storing the winner.
    for (const { left } of found) {
      delete leadIdsByEmail[left.email];
      // The _id rival's slot is overwritten in place below, which keeps the
      // key order of the result stable; any other loser must be removed.
      if (left !== idRival) {
        delete leadsById[left._id];
      }
    }
    leadsById[_id] = currentLead;
    leadIdsByEmail[email] = _id;
  }

  // Hand back an ordinary object so it logs and serialises like before.
  return { leadsById: { ...leadsById }, collisions };
}

/**
 * Command-line entry point: de-duplicate `./leads.json` and print a report.
 */
function main() {
  const fs = require("fs");
  const { leads } = JSON.parse(fs.readFileSync("./leads.json", "utf8"));
  const { leadsById, collisions } = dedupeLeads(leads);

  console.log("collisions", collisions);
  console.log("leadsById", leadsById);

  console.log("records processed:", leads.length);
  console.log("collisions:", collisions.length);
  console.log("output leads:", Object.keys(leadsById).length);
}

// Run the report only when executed directly (`node index.js`); when loaded
// with require() just expose the function.
if (typeof module !== "undefined" && typeof require !== "undefined") {
  module.exports = { dedupeLeads };
  if (require.main === module) {
    main();
  }
}
"jkj238238jdsnfsj23", +"email": "coo@bar.com", +"firstName": "Ted", +"lastName": "Jones", +"address": "456 Neat St", +"entryDate": "2014-05-07T17:32:20+00:00" +}, +{ +"_id": "sel045238jdsnfsj23", +"email": "foo@bar.com", +"firstName": "John", +"lastName": "Smith", +"address": "123 Street St", +"entryDate": "2014-05-07T17:32:20+00:00" +}, +{ +"_id": "qest38238jdsnfsj23", +"email": "foo@bar.com", +"firstName": "John", +"lastName": "Smith", +"address": "123 Street St", +"entryDate": "2014-05-07T17:32:20+00:00" +}, +{ +"_id": "vug789238jdsnfsj23", +"email": "foo1@bar.com", +"firstName": "Blake", +"lastName": "Douglas", +"address": "123 Reach St", +"entryDate": "2014-05-07T17:33:20+00:00" +}, +{ +"_id": "wuj08238jdsnfsj23", +"email": "foo@bar.com", +"firstName": "Micah", +"lastName": "Valmer", +"address": "123 Street St", +"entryDate": "2014-05-07T17:33:20+00:00" +}, +{ +"_id": "belr28238jdsnfsj23", +"email": "mae@bar.com", +"firstName": "Tallulah", +"lastName": "Smith", +"address": "123 Water St", +"entryDate": "2014-05-07T17:33:20+00:00" +}, +{ +"_id": "jkj238238jdsnfsj23", +"email": "bill@bar.com", +"firstName": "John", +"lastName": "Smith", +"address": "888 Mayberry St", +"entryDate": "2014-05-07T17:33:20+00:00" +}] +} \ No newline at end of file diff --git a/package.json b/package.json new file mode 100644 index 0000000..2c405b6 --- /dev/null +++ b/package.json @@ -0,0 +1,11 @@ +{ + "name": "adobe-coding-challenge", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "", + "license": "ISC" +}