CouchDB cleanup script for purging old docs

CouchDB does not have straightforward ways to clean up old data. This is one simple way do delete entries by date, but it requires that

  • Your documents have date or timestamp property
  • There is view for each database to fetch documents for that property

Prerequisities

  1. Node.js
  2. jss module, i.e. ‘npm install jss’.

1. Prepare views for Cleanup

Define view in each database that needs regular cleanup. Use something like this where emitted key field is timestamp in seconds.

views: {
    created: {
	map: function(doc) {
		if ( doc.created ) {
			emit(doc.created, doc._rev);
		}
	}
    }
    ....
}

2. The Cleanup script

Script queries old doc ids from the cleanup view and marks them as deleted. Documents are not deleted immediately but are removed physically on next CouchDB compact. CouchDB 1.2.0 supports autocompact so just enable it and don’t worry about it.

#!/bin/bash

DBHOST=localhost

# Get key for entries that are over 6 months old. This assumes that created view can be queried using timestamps as keys.
if uname -a | grep -i darwin > /dev/null
then
	TODAY=$(date '+%Y-%m-%d')
	MONTHSAGO=$(date -v -24w '+%Y-%m-%d')
	MONTHSAGO_E=$(date -v -24w '+%s')
else
	TODAY=$(date '+%Y-%m-%d')
	MONTHSAGO=$(date -d '24 weeks ago' '+%Y-%m-%d')
	MONTHSAGO_E=$(date -d '24 weeks ago' '+%s')
fi

PATH=$PATH:/usr/local/bin

# JSON scripting tool
JSS=$(npm bin)/jss

cleanup() {
	DATABASE=$1
	DESIGN=$2

	echo "Cleaning $DATABASE/$DESIGN"
	curl --silent -S http://$DBHOST:5984/$DATABASE/_design/$DESIGN/_view/created?endkey=$MONTHSAGO_E | \
		$JSS --bulk_docs '$.id' '{_id: $.id, _rev:$.value, _deleted:true}' | \
		curl --silent -S -X POST -d @-  -H "Content-Type:application/json" http://$HOST:$PORT/$DATABASE/_bulk_docs | \
		sed 's/\({[^}]*}\),/\1\n/g' | tr -d '[]' | \
		$JSS '$.ok != true'
}

echo "STATS CLEANUP <= $MONTHSAGO - Start" `date`

# Put databases and views here
cleanup somedb1 someview
cleanup somedb1 otherview
cleanup somedb2 alsoview

echo "STATS CLEANUP - Done" `date`

The script does this

  1. Get expired docs, e.g. curl ‘http://localhost:5984/mydatabase/_design/mydesign/_view/created?endkey=1337049581&#8217;
  2. Build bulk doc delete request (jss)
  3. Issue delete bulk request (curl post)
  4. sanitize couchdb output, i.e. add newlines and remove brackets (sed)
  5. print failed ones

Note that default version of jss doesn’t output proper JSON if no documents are found, use my fork to workaround this problem if you dont want to see errors in logs.

npm install https://github.com/tikonen/jss/tarball/master

Keeping CouchDB design docs up to date with Node.js

CouchDB views are defined typically as Javascript snippets and are part of special documents called design documents. I noticed that keeping these design documents up to date during development is pretty cumbersome and error prone. So I devised simply way to keep them updated using Node.js and Cradle couchdb driver.

Idea is to define the views in as variables in runnable js script and run that with Node each time it’s changed.

Here is the code. Copy it to e.g. cdb-views.js.

var cradle = require('cradle');

cradle.setup({ host: 'localhost',
               port: 5984,
               options: { cache:true, raw: false }});

var cclient = new (cradle.Connection)

function _createdb(dbname) {
    var db = cclient.database(dbname);
    db.exists(function(err, exists) {
        if (!exists) {
            db.create()
        }
    });
    return db;
}
var DB_SOMETHING = _createdb('somedb')

function cradle_error(err, res) {
    if (err) console.log(err)
}


function update_views( db, docpath, code ) {

    function save_doc() {
        db.save(docpath, code, function(err) {
            // view has changed, so initiate cleanup to get rid of old
            // indexes
            db.viewCleanup( cradle_error );
        });

        return true;
    }

    function compare_code( str1, str2 ) {
        var p1 = str1.split('\n');
        var p2 = str2.split('\n');

        for ( var i=0; i < p1.length || i < p2.length; i++ ) {
            var l1 = p1[i];
            var l2 = p2[i];
            l1 = l1 ? l1.trim() : '';
            l2 = l2 ? l2.trim() : '';
            if ( !l1 && !l2 ) continue;
            if ( l1 != l2 ) return true;
        }
        return false;
    }

    // compare function definitions in document and in code
    function compare_def(docdef, codedef) {
        var i = 0;

        if (!docdef && codedef) {
            console.log('creating "' + docpath +'"')
            return true;
        }
        if (!codedef && docdef) {
            console.log('removing "' + docpath +'"')
            return true;
        }
        if (!codedef && !docdef) {
            return false;
        }

        for (var u in docdef) {

            i++;
            if (codedef[u] == undefined) {
                console.log('definition of "' + u + '" removed - updating "' + docpath +'"')
                return true;
            }

            if (typeof(codedef[u]) == 'function') {
                if (!codedef[u] || compare_code( docdef[u], codedef[u].toString()) ) {
                    console.log('definition of "' + u + '" changed - updating "' + docpath +'"')
                    return true;
                }
            } else for (var f in docdef[u]) {
                i++;
                if (!codedef[u][f] || compare_code( docdef[u][f], codedef[u][f].toString()) ) {
                    console.log('definition of "' + u + '.' + f + '" changed - updating "' + docpath +'"')
                    return true;
                }

            }
        }
        // check that both doc and code have same number of functions
        for (var u in codedef) {
            i--;
            if (typeof(codedef[u]) != 'function') {
                for (var f in codedef[u]) {
                    i--;
                }
            }
        }
        if (i != 0) {
            console.log('new definitions - updating "' + docpath +'"')
            return true;
        }

        return false;
    }

    db.get(docpath, function(err, doc) {

        if (!doc) {
            console.log('not found - creating "' + docpath +'"')
            return save_doc();
        }

        if (compare_def(doc.updates, code.updates) || compare_def(doc.views, code.views)) {
            return save_doc();
        }
        console.log('"' + docpath +'" up to date')
    });
}

var EXAMPLE1_DDOC = {
    language: 'javascript',
    views: {
        active: {
            map: function (doc) {
                if (doc.lastsession) {
                    emit(parseInt(doc.lastsession / 1000), 1)
                }
            },
            reduce: function(keys, counts, rereduce) {
                return sum(counts)
            }
        },
        users: function(doc) { 
            if (doc.created) {
                emit(parseInt(doc.created / 1000), 1)
            }
        }
    }    
}

var EXAMPLE2_DDOC = {
    language: 'javascript',
    views: {
        myview: function(doc) {
            if (doc.param1 && doc.param2) {
                emit([doc.param1, doc.param2], null)
            }
        }
    }
}

update_views(DB_SOMETHING, '_design/example1', EXAMPLE1_DDOC);
update_views(DB_SOMETHING, '_design/example2', EXAMPLE2_DDOC);

The code is pretty simple.

  1. First it loads the Cradle couchdb driver and creates needed databases if they do not already exist. In this example only single database ‘somedb’ is created.
  2. The update_views is responsible of keeping the design docs up to date. It loads the design doc from defined DB and compares it to the code defined in the design doc in this file. If it has changed (or missing) it will be recreated.
  3. The example design docs (EXAMPLE1_DDOC and EXAMPLE2_DDOC) are simple design doc definitions as Javascript object. You’re familiar with CouchDB so this is self explanatory.
  4. Lastly the code just calls the update_views to update the design documents.

Now it’s possible to maintain the views in this Javascript file, the Node will make sure that the syntax is always valid.

Example output:

Views are up to date.

$ node cdb-views.js
"_design/example1" up to date
"_design/example2" up to date

Definition of view example2/myview has changed

$ node cdb-views.js
"_design/example1" up to date
definition of "myview" changed - updating "_design/example2"

Design doc example2 can not be found and is created.

$ node cdb-views.js
"_design/example1" up to date
no design doc found updating "_design/example2"