NodeJS JavaScript Email Parsing

If you use NodeJS or JavaScript and need to parse emails then this guide is for you. Learn how to scrape email signatures with NodeJS and helpful libraries that make it easy to pull emails.
Get a FREE demo and trial of SigParser
No commitment required


In this article, we’ll cover:

Why parsing emails is hard

Splitting Email Chains

If you want to split emails on the headers with NodeJS, that is tough. No two email clients seem to produce the same header format and over time the email clients change the way they format headers.

Signature and Contact Detail Detection

Parsing email signatures with NodeJS is also difficult. Many think they can just use a couple of regular expressions and they'll be done. But the more you work on the problem, the harder it becomes. Here are some of the major things you'd need to handle.

  • No signature is formatted the same
  • Phone numbers can have many different formats
  • Need to attribute the type of phone number (Fax vs Mobile vs Work phones).
  • The phone type indicator has lots of variations. For example, Mobile vs M: vs Cell vs C: and many others
  • Titles can be incredibly difficult to capture without getting too much wrong information.
  • Locations are tough. Very few people put full addresses. Often they'll only put the city and state but no country. Even street addresses are massively different by country.

Then there is identifying where the email signature is in the email which is really hard. We use a machine learning algorithm with lots of labeled validation emails. We've been labeling our test set for years across many organizations.

This is why we expose a simple email parsing API for use with NodeJS

All of the above is why we suggest using our email parsing service to parse emails from NodeJS. As we improve, you'll automatically get any improvements without the need to redeploy any of your code. If an email client like Gmail starts using a new reply header format, we'll have a fix deployed within days and you won't have to do anything.

The SigParser Email Parsing API

The SigParser Email Parsing API is a serverless, stateless email parsing API. It can extract contacts and split emails into sections. It can find phone numbers, titles, addresses and attribute them to the correct contact. It even takes care of deduping contacts for you if the same email address appears in the email.

The API is stateless. We store nothing about the email. It is a processing-only service. We only store some high-level statistics.

Example: Parse an Email with NodeJS

const request = require("request");

// Request description for the SigParser parsing endpoint.
// Replace the 'x-api-key' placeholder with your own API key.
const options = {
    method: 'POST',
    url: 'https://ipaas.sigparser.com/api/Parse/Email/Message/JSON', // OR https://ipaas.sigparser.com/api/Parse/Email/Contact/JSON
    headers: {
        'cache-control': 'no-cache',
        'x-api-key': '232323232323',
        'content-type': 'application/json',
    },
    // The email to parse: sender address, sender display name and plain-text body.
    body: {
        from_address: 'jsmith@example.com',
        from_name: 'John Smith',
        plainbody: "This is the body of the email.",
    },
    // Send `body` as JSON and get the response body back as a parsed object.
    json: true,
};

// POST the email and print the parsed result.
request(options, (error, response, body) => {
    // Rethrow the original error untouched; wrapping it in `new Error(error)`
    // would stringify it and discard its stack trace and error code.
    if (error) throw error;
    console.log(body);
});

And you’ll get a response like

{
    "error": "string",
    "contacts": [
      {
        "firstName": "string",
        "lastName": "string",
        "emailAddress": "string",
        "emailAddressDomain": "string",
        "emailAddressDomainWithoutTLD": "string",
        "phoneNumber": "string",
        "mobilePhone": "string",
        "voipPhone": "string",
        "officePhone": "string",
        "fax": "string",
        "address": "string",
        "title": "string",
        "twitterUrl": "string",
        "twitterHandle": "string",
        "linkedInUrl": "string",
        "linkedInHandle": "string",
        "companyName": "string",
        "website": "string"
      }
    ],
    "isSpammyLookingEmailMessage": true,
    "isSpammyLookingSender": true,
    "isSpam": true,
    "from_LastName": "string",
    "from_FirstName": "string",
    "from_Fax": "string",
    "from_Phone": "string",
    "from_Address": "string",
    "from_Title": "string",
    "from_MobilePhone": "string",
    "from_OfficePhone": "string",
    "from_LinkedInUrl": "string",
    "from_TwitterUrl": "string",
    "from_TwitterHandle": "string",
    "from_EmailAddress": "string",
    "emails": [
      {
        "from_EmailAddress": "string",
        "from_Name": "string",
        "textBody": "string",
        "htmlLines": [
          "string"
        ],
        "date": "2019-05-05T22:27:56.124Z",
        "didParseCorrectly": true,
        "to": [
          {
            "name": "string",
            "emailAddress": "string"
          }
        ],
        "cc": [
          {
            "name": "string",
            "emailAddress": "string"
          }
        ],
        "htmlBody": "string",
        "spammyLookingEmail": true,
        "subject": "string",
        "cleanedBodyHtml": "string",
        "cleanedBodyPlain": "string"
      }
    ],
    "from_LinkedInHandle": "string",
    "duration": 0,
    "cleanedemailbody": "string",
    "cleanedemailbody_ishtml": true,
    "cleanedemailbody_plain": "string",
    "from_CompanyName": "string",
    "from_Website": "string",
    "from_EmailAddressDomain": "string",
    "from_EmailAddressDomainWithoutTLD": "string"
  }

Email Libraries for Pulling Emails

Before you can parse an email with NodeJS and JavaScript, you’ll need to fetch emails. Each email server type requires its own set of libraries, conventions and rules for fetching emails.

We have not validated any of these for security so you should review the security of all of these yourself before using them.

Gmail General Advice

There are two ways to access Google email. The first is plain IMAP, which we’ll detail below. The other is the Gmail API. You can also use IMAP with OAuth authentication.

In order to use OAuth, there are some requirements you should be aware of. If you’re only having internal email accounts connect, then it is pretty easy to set up an API key. If you need external accounts to connect via OAuth, then you will have to do a security audit with Google.

For IMAP you could just have users create app passwords. But assuming you have all that figured out, here are some libraries for accessing Google emails.

Google: npm googleapis

The official Google API library

const {google} = require('googleapis');

// Gmail API client (v1).
// NOTE(review): Gmail user-data calls normally require OAuth credentials rather
// than a plain API key — confirm which credential type your project needs.
const gmail = google.gmail({
    version: 'v1',
    auth: 'YOUR API KEY'
});

/**
 * Calls `users.watch` for the authenticated user ('me') with the configured
 * topic name and prints the response data.
 *
 * Wrapped in an async function because `await` is not allowed at the top
 * level of a CommonJS (`require`) script.
 *
 * @returns {Promise<void>} Resolves once the response has been logged.
 */
async function main() {
    const res = await gmail.users.watch({
        userId: 'me',
        requestBody: {
            // Replace with `projects/${PROJECT_ID}/topics/${TOPIC_NAME}`
            topicName: `projects/el-gato/topics/gmail`
        }
    });
    console.log(res.data);
}

// Surface failures instead of leaving an unhandled promise rejection.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

Office 365: npm @microsoft/microsoft-graph-client

This is kind of a difficult API to get your head wrapped around at first.

The NPM library usage documentation can be found here.

To really understand the API you should use the graph explorer. Without this it is hard to know what is available and really how it works.

The Office 365 Mail Message API Spec documentation.

import { Client } from "@microsoft/microsoft-graph-client";

// TODO: Set up your authProvider. See the npm page.

const options = {
    authProvider, // An instance created from previous step
};
const client = Client.initWithMiddleware(options);

// Issue a GET request against the "/me" resource and print the result.
// Top-level `await` is valid here because this file is an ES module (it uses
// `import`). Any failure propagates to the caller unchanged; the previous
// try/catch only rethrew the same error, so it has been removed.
const userDetails = await client.api("/me").get();
console.log(userDetails);

Implementation Advice

  • This package just helps you compose ODATA formatted requests.
  • If you need to sync emails every X hours or even on an initial pull, be sure to use the “delta” endpoint.
    • You can only sort the delta sync by “receivedDateTime desc” otherwise the results will be random.
  • If you try to sync by just querying messages newer than some date using the “List messages” endpoint, you could encounter the following errors:
    • Timeouts if the mailbox is too large
    • Missing messages

Exchange

We don’t have any suggestions for Exchange as most companies these days are moving to Office 365. But if you’re looking for packages, you should search for “Exchange Web Services” or “EWS” which is what it is normally called. Some of the ones we found were:

IMAP: npm imap

Extremely popular library for fetching emails. This is an old format but almost every email server supports this including Gmail.

const Imap = require('imap');
const { inspect } = require('util');

// Connection settings for Gmail's IMAP endpoint over TLS.
const imap = new Imap({
  user: 'mygmailname@gmail.com',
  password: 'mygmailpassword',
  host: 'imap.gmail.com',
  port: 993,
  tls: true
});

/**
 * Opens the INBOX mailbox and hands the result to `cb`.
 * NOTE(review): the `true` argument is presumably the read-only flag — confirm
 * against the imap package documentation.
 */
function openInbox(cb) {
  imap.openBox('INBOX', true, cb);
}

/**
 * Logs the parsed header, the attributes and a completion line for one
 * fetched message.
 */
function handleMessage(msg, seqno) {
  console.log('Message #%d', seqno);
  const prefix = `(#${seqno}) `;

  msg.on('body', (stream, info) => {
    // Accumulate the streamed chunks, then parse them once the stream ends.
    let raw = '';
    stream.on('data', (chunk) => {
      raw += chunk.toString('utf8');
    });
    stream.once('end', () => {
      console.log(`${prefix}Parsed header: %s`, inspect(Imap.parseHeader(raw)));
    });
  });

  msg.once('attributes', (attrs) => {
    console.log(`${prefix}Attributes: %s`, inspect(attrs, false, 8));
  });

  msg.once('end', () => {
    console.log(`${prefix}Finished`);
  });
}

// Once connected: open the inbox, then fetch selected header fields of
// sequence numbers 1 through 3.
imap.once('ready', () => {
  openInbox((err, box) => {
    if (err) throw err;

    const fetcher = imap.seq.fetch('1:3', {
      bodies: 'HEADER.FIELDS (FROM TO SUBJECT DATE)',
      struct: true
    });

    fetcher.on('message', handleMessage);

    fetcher.once('error', (fetchErr) => {
      console.log(`Fetch error: ${fetchErr}`);
    });

    fetcher.once('end', () => {
      console.log('Done fetching all messages!');
      imap.end();
    });
  });
});

// Connection-level errors are only logged.
imap.once('error', (err) => {
  console.log(err);
});

imap.once('end', () => {
  console.log('Connection ended');
});

imap.connect();

Try SigParser for FREE

Try SigParser for FREE with no commitment. Schedule a 15-minute web conference to get an overview of SigParser and get set up with a free trial account. No commitment required.