Phrases extraction and D3 Wordcloud

A JavaScript solution for extracting n-grams from reviews and visualising topics in a wordcloud

A 100% JavaScript solution for extracting phrases from text and displaying the key points in a beautiful D3 wordcloud. Phrase (n-gram) extraction is done in phrase_extraction.js; adjustments to the wordcloud can be made in wordcloud.js. Get the code from GitHub.

Demo

Data

Data taken from datafiniti/Hotel Reviews.

Extract phrases

Parameters

Tweak extraction logic with the following parameters:

var atLeast = 2;       // Minimum number of occurrences a phrase needs before it is reported
var numWords = 5;      // Nominal maximum phrase length in words (get_top_phrases increments this once before counting)
var ignoreCase = true; // When true, the text is lower-cased before counting, so matching is case-insensitive
var REallowedChars = /[^a-zA-Z']+/g; // Matches runs of characters other than letters and apostrophes; every such run is replaced with a single space
// Comma-separated stop words; each word of a candidate phrase is checked against this list
var common = "poop,i,me,my,myself,we,us,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,whose,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,will,would,should,can,could,ought,i'm,you're,he's,she's,it's,we're,they're,i've,you've,we've,they've,i'd,you'd,he'd,she'd,we'd,they'd,i'll,you'll,he'll,she'll,we'll,they'll,isn't,aren't,wasn't,weren't,hasn't,haven't,hadn't,doesn't,don't,didn't,won't,wouldn't,shan't,shouldn't,can't,cannot,couldn't,mustn't,let's,that's,who's,what's,here's,there's,when's,where's,why's,how's,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,upon,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,say,says,said,shall";

Logic

  • load text
  • initialise empty hash to store all possible phrases
  • clean text with only valid characters
  • create phrases hash
  • count phrases set
  • sort phrases set

Code

/**
 * Extract the most frequent multi-word phrases (n-grams) from a collection of texts.
 *
 * The texts are concatenated, stripped of everything except letters and
 * apostrophes, lower-cased and split into words. Every run of consecutive words
 * is counted; runs that occur at least `atLeast` times, consist of two or more
 * words and contain no stop word or one-letter word become candidates. The
 * `top_n` most frequent candidates are passed through phrases_to_onehot() and
 * remove_subset(), and the phrases flagged as kept are returned.
 *
 * @param {Array<string>|Object} _list_text - texts to analyse (own enumerable values are used).
 * @param {number} top_n - maximum number of candidate phrases to consider.
 * @param {Array<string>} [remove_these] - phrases that must never appear in the output.
 * @returns {Object} map of phrase -> number of occurrences.
 */
function get_top_phrases(_list_text, top_n, remove_these){

  // Comma-separated stop words; a phrase containing any of them is discarded.
  var common = "poop,i,me,my,myself,we,us,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,whose,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,will,would,should,can,could,ought,i'm,you're,he's,she's,it's,we're,they're,i've,you've,we've,they've,i'd,you'd,he'd,she'd,we'd,they'd,i'll,you'll,he'll,she'll,we'll,they'll,isn't,aren't,wasn't,weren't,hasn't,haven't,hadn't,doesn't,don't,didn't,won't,wouldn't,shan't,shouldn't,can't,cannot,couldn't,mustn't,let's,that's,who's,what's,here's,there's,when's,where's,why's,how's,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,upon,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,say,says,said,shall";

  var atLeast = 2;       // Minimum number of occurrences for a phrase to be reported
  var numWords = 5;      // Nominal maximum phrase length in words
  var ignoreCase = true; // Lower-case the text before counting
  var REallowedChars = /[^a-zA-Z']+/g; // Runs of characters that are NOT letters or apostrophes
  var REwordSeparators = /[ '\-\(\)\*":;\[\]|{},.!?]+/; // Splits a counted n-gram into its words

  // The original code incremented numWords once and used inclusive loop
  // bounds, so phrases of up to numWords + 1 words have always been counted.
  // That range is kept so existing output does not lose phrases.
  var maxWords = numWords + 1;

  // Exact-match stop-word lookup. A substring search on the comma-joined
  // string would also reject innocent words such as "low" (inside "below").
  var stopWords = Object.create(null);
  common.split(",").forEach(function(stopWord){
    stopWords[stopWord] = true;
  });

  // A missing exclusion list means "exclude nothing".
  var excluded = remove_these || [];

  var i, j, k, s, ngram, count, words, phrase;

  // Join all texts into one string; Object.keys visits own entries only.
  var text = "";
  Object.keys(_list_text || {}).forEach(function(key){
    text += _list_text[key] + " ";
  });

  // counts[n] maps every n-word sequence to its number of occurrences.
  // Index 0 is unused. Prototype-less objects keep words such as
  // "constructor" from colliding with inherited properties.
  var counts = [null];
  for (i = 1; i <= maxWords; i++) {
    counts.push(Object.create(null));
  }

  // Remove all irrelevant characters and trim
  text = text.replace(REallowedChars, " ").replace(/^\s+/,"").replace(/\s+$/,"");
  if (ignoreCase) text = text.toLowerCase();

  // Count every n-gram that starts at each word position
  var tokens = text.split(/\s+/);
  var total = tokens.length;
  for (i = 0; i < total; i++) {
    s = tokens[i];
    counts[1][s] = (counts[1][s] || 0) + 1;
    for (j = 2; j <= maxWords && i + j <= total; j++) {
      s += " " + tokens[i + j - 1];
      counts[j][s] = (counts[j][s] || 0) + 1;
    }
  }

  // A word is usable when it is longer than one letter and not a stop word.
  function is_keyword(word){
    return word.length > 1 && !stopWords[word.toLowerCase()];
  }

  // Collect candidate phrases, shortest n-grams first
  var candidates = [];
  for (k = 1; k <= maxWords; k++) {
    for (ngram in counts[k]) {
      count = counts[k][ngram];
      if (count < atLeast) continue;
      // Apostrophes also separate words here, so "don't" yields "don" and "t"
      words = ngram.split(REwordSeparators);
      if (words.length > 1 && words.every(is_keyword)) {
        candidates.push({"word": words.join(" "), "count": count});
      }
    }
  }

  // Most frequent first
  candidates.sort(function(x, y){ return y.count - x.count; });

  // Take at most top_n distinct phrases. Stopping at the end of the candidate
  // list avoids reading past it when fewer than top_n phrases were found.
  var list_phrases = [];
  var phrases_count = Object.create(null);
  for (i = 0; i < candidates.length && list_phrases.length < top_n; i++) {
    phrase = candidates[i].word;
    if (phrase in phrases_count) continue; // keep the most frequent duplicate
    phrases_count[phrase] = candidates[i].count;
    list_phrases.push(phrase);
  }

  var phrases_onehot = phrases_to_onehot(list_phrases);
  var phrases_remove_subset = remove_subset(list_phrases, phrases_onehot);

  var output = {};
  Object.keys(phrases_remove_subset).forEach(function(p){
    // indexOf returns -1 when absent, so it must be compared explicitly:
    // used as a boolean it only ever excluded the entry at index 0.
    if (phrases_remove_subset[p] && excluded.indexOf(p) === -1) {
      output[p] = phrases_count[p];
    }
  });

  return output;
}



//////////////////

/**
 * Build an array that holds `len` copies of `value`.
 *
 * @param {*} value - element to repeat.
 * @param {number} len - requested length.
 * @returns {Array} empty array when `len` is 0, otherwise at least one copy of `value`.
 */
function fillArray(value, len) {
  var filled = [];
  if (len == 0) {
    return filled;
  }
  // Any non-zero length starts from a single element and grows from there.
  filled.push(value);
  while (filled.length + 1 <= len) {
    filled.push(value);
  }
  return filled;
}

/**
 * Tell whether one-hot row `a` is covered by one-hot row `b`.
 *
 * @param {Array<number>} a - row to test.
 * @param {Array<number>} b - row to test against.
 * @returns {boolean} false as soon as some position is 1 in `a` but 0 in `b`
 *   (difference of exactly 1), true otherwise. Identical rows yield true.
 */
function a_is_subset_of_b (a, b) {
  return a.every(function (bit, position) {
    return bit - b[position] !== 1;
  });
}

/**
 * Encode phrases as one-hot rows over their shared vocabulary.
 *
 * The vocabulary is every distinct word found in `list_phrases`, in order of
 * first appearance. Each returned row has one column per vocabulary word,
 * holding 1 when the phrase contains that word and 0 otherwise.
 *
 * @param {Array<string>} list_phrases - phrases to encode.
 * @returns {Array<Array<number>>} one row per phrase, in input order.
 */
function phrases_to_onehot(list_phrases){
  var REwordSeparators = /[ '\-\(\)\*":;\[\]|{},.!?]+/;

  // Split every phrase once and reuse the words for both passes.
  var tokenised = list_phrases.map(function(phrase){
    return phrase.split(REwordSeparators);
  });

  // word -> column index. A direct lookup replaces a linear indexOf scan of
  // the vocabulary for every word; the prototype-less object keeps words such
  // as "constructor" from matching inherited properties.
  var column_of = Object.create(null);
  var vocab_size = 0;
  tokenised.forEach(function(words){
    words.forEach(function(word){
      if (!(word in column_of)) {
        column_of[word] = vocab_size++;
      }
    });
  });

  return tokenised.map(function(words){
    var this_row = fillArray(0, vocab_size);
    words.forEach(function(word){
      this_row[column_of[word]] = 1;
    });
    return this_row;
  });
}

/**
 * Decide which phrases to keep after discarding those whose words are all
 * contained in another phrase.
 *
 * A phrase is dropped when its row is covered by a different row with more
 * words. When two rows contain exactly the same words (e.g. "hotel room" and
 * "room hotel") only the later one is dropped; treating each as a subset of
 * the other would discard both and lose the topic entirely.
 *
 * @param {Array<string>} list_phrases - phrases, in the same order as `row`.
 * @param {Array<Array<number>>} row - one-hot rows as built by phrases_to_onehot().
 * @returns {Object} map of phrase -> true (keep) or false (covered by another phrase).
 */
function remove_subset(list_phrases, row){
  var hasOwn = Object.prototype.hasOwnProperty;
  var results = {};

  for(var i=0;i<row.length;i++){
    var this_is_unique = true;

    // Stop scanning as soon as one covering phrase has been found.
    for(var j=0;j<row.length && this_is_unique;j++){
      if(i==j || !a_is_subset_of_b(row[i], row[j])) continue;

      // Rows covering each other hold the same words; then only the later
      // phrase gives way, so exactly one of the pair survives.
      var same_words = a_is_subset_of_b(row[j], row[i]);
      if(!same_words || j < i){
        this_is_unique = false;
      }
    }

    // A phrase listed twice must not lose the "keep" verdict of its first entry.
    var phrase = list_phrases[i];
    if(!(hasOwn.call(results, phrase) && results[phrase] === true)){
      results[phrase] = this_is_unique;
    }
  }
  return results;
}

Generate wordcloud

Parameters

var fill_colors = ["#29e8c4", "#29e8d8", "#29e4e8", "#29cbe8"]; // Text fill palette; each word's colour is picked by its index, cycling through the list

Code

/**
 * Render a word cloud into the "#chart" element.
 *
 * Relies on the globals `$` (jQuery), `d3` (v3-style `d3.scale` / `d3.entries`)
 * and the d3-cloud layout (`d3.layout.cloud`) being loaded on the page.
 *
 * @param {Object} word_count - map of phrase -> count; the count drives the font size.
 */
function make_wordcloud(word_count){

  // Text fill palette; words cycle through it by index.
  var fill_colors = ["#29e8c4", "#29e8d8", "#29e4e8", "#29cbe8"];
  // Shared by the layout (to measure words) and the rendered text (to draw them).
  var font_family = "'helvetica neue', helvetica, arial, sans-serif";

  drawWordCloud(word_count, "#chart");

  // Lays out the entries of word_count and appends the resulting SVG to div_id.
  function drawWordCloud(word_count, div_id){

    var width = $(document).width();
    var height = $(document).height();
    var word_entries = d3.entries(word_count); // [{key: phrase, value: count}, ...]

    // Map counts linearly onto font sizes between 10px and 70px.
    var xScale = d3.scale.linear()
      .domain([0, d3.max(word_entries, function(d) { return d.value; })])
      .range([10,70]);

    // No stop() call follows start(): the previous d3.layout.cloud().stop()
    // built a second, unused layout and stopped that one, which had no effect
    // on this layout.
    d3.layout.cloud().size([width, height])
      .timeInterval(200)
      .words(word_entries)
      .fontSize(function(d) { return xScale(+d.value); })
      .text(function(d) { return d.key; })
      .rotate(function() {
        // return ~~(Math.random() * 2) * 90;
        return 0; // comment this and un-comment above to have 90 degree rotations
      })
      .font(font_family)
      .on("end", draw)
      .start();

    // Called by the layout once placement has finished; draws one <text> per word.
    function draw(words) {

      // Group translated to the centre of the SVG; each word is positioned via d.x / d.y.
      var canvas = d3.select(div_id).append("svg")
        .attr("width", width)
        .attr("height", height)
        .append("g")
        .attr("transform", "translate(" + [width >> 1, height >> 1] + ")");

      canvas.selectAll("text")
        .data(words)
        .enter().append("text")
        .style("font-size", function(d) { return xScale(d.value) + "px"; })
        .style("font-family", font_family)
        .style("font-weight", 100)
        // Cycle through the whole palette whatever its length.
        .style("fill", function(d, i) { return fill_colors[i % fill_colors.length]; })
        .attr("text-anchor", "middle")
        .attr("transform", function(d) {
          return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
        })
        .text(function(d) { return d.key; });

    }
  }
}

Thanks for reading!