Using node.js to scrape the Texas Department of Criminal Justice list of executed inmates from http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html
Background
I came across this list via reddit r/dataisbeautiful and a great word cloud derived from death row inmates’ last statements(written and spoken). This list would be a good exercise to scrape and wrangle the data into a usable JSON dataset.
Scrape, Transform, and Output
Dependencies
Install
npm install request --save
npm install cheerio --save
Get the Data
// Add modules
var request = require('request');
var $ = require('cheerio');
var fs = require('fs');
// State of Texas executed list
var url = 'http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html';
function Inmate() {
};
function parseRows(url, done) {
request(url, function(err, res, html) {
if (err) return log(err);
var parsedHtml = $.load(html);
var json = [];
parsedHtml('tr').map(function(i, elem) {
if (i > 0) {
var row = elem.children.filter(function(d) { return d.type === 'tag' });
var inmate = new Inmate;
row.forEach(function(item, index, arr) {
var len = arr.length;
attrInmate(index, item, inmate, function(d) {
json.push(d);
});
});
}
}).end(done(json));
});
}
function attrInmate(index, attr, inmate, done) {
if (index === 0) {
inmate.execution = attr.children[0].data;
}
if (index === 3) {
inmate.lastName = attr.children[0].data;
}
if (index === 4) {
inmate.firstName = attr.children[0].data;
}
if (index === 5) {
inmate.tdcjNumber = attr.children[0].data;
}
if (index === 6) {
inmate.age = attr.children[0].data;
}
if (index === 7) {
inmate.executionDate = attr.children[0].data;
}
if (index === 8) {
inmate.race = attr.children[0].data;
}
if (index === 9) {
inmate.county = attr.children[0].data;
done(inmate)
}
}
function log() {
var args = Array.prototype.slice.call(arguments);
args.forEach(function(item) {
console.log(item)
})
}
// parseRows('http://www.tdcj.state.tx.us/death_row/dr_info/villegasjoselast.html')
parseRows(url, function(json) {
var data = JSON.stringify(json, null, 2);
fs.writeFile('./executed.json', data, function(err) {
if (err) return log(err);
log('Finished')
})
});
Output executed.json
[
{
"execution": "515",
"lastName": "Villegas",
"firstName": "Jose",
"tdcjNumber": "999417",
"age": "39",
"executionDate": "04/16/2014",
"race": "Hispanic",
"county": "Nueces"
}, ...]
Checkout the full dataset on Github https://github.com/apburnes/texecuted