2 * LMDBBackend - a high performance LMDB based backend for PowerDNS written by
5 * This was originally going to be a backend using BerkeleyDB 5 for high
6 * performance DNS over massive (millions of zones) databases. However,
7 * BerkeleyDB had a number of issues to do with locking, contention and
8 * corruption which made it unsuitable for use. Instead, we use LMDB to perform
11 * See the documentation for more details, and lmdb-example.pl for an example
12 * script which generates a simple zone.
15 #include "pdns/utility.hh"
16 #include "pdns/dnsbackend.hh"
17 #include "pdns/dns.hh"
18 #include "pdns/dnspacket.hh"
19 #include "pdns/pdnsexception.hh"
20 #include "pdns/logger.hh"
22 #include "lmdbbackend.hh"
23 #include "pdns/arguments.hh"
24 #include "pdns/base32.hh"
25 #include "pdns/lock.hh"
// Debug logging helper: this variant streams msg to the logger object L at Logger::Error level.
28 #define DEBUGLOG(msg) L<<Logger::Error<<msg
// Alternative variant: expands to an empty statement, so debug messages are dropped.
// NOTE(review): the preprocessor condition selecting between the two variants is not visible in this excerpt.
30 #define DEBUGLOG(msg) do {} while(0)
// Class-wide reload counter, starting at zero; needReload() compares it with each instance's d_lastreload.
33 int LMDBBackend::s_reloadcount
=0;
// Statically initialised mutex shared by all instances.
// NOTE(review): presumably serialises initialisation; no user of it is visible in this excerpt.
34 pthread_mutex_t
LMDBBackend::s_initlock
= PTHREAD_MUTEX_INITIALIZER
;
// Constructs a backend instance for the given configuration suffix.
// Visible steps: set the argument prefix to "lmdb"+suffix, read the
// "experimental-dnssec" switch into d_doDnssec, and record the current
// class-wide reload counter in d_lastreload.
36 LMDBBackend::LMDBBackend(const string
&suffix
)
38 setArgPrefix("lmdb"+suffix
);
// Reading the switch is paired with the ArgException handler that follows.
40 d_doDnssec
= mustDo("experimental-dnssec");
// NOTE(review): the handler body is not visible in this excerpt; the exception
// is caught by value here, catching by const reference would avoid a copy.
42 catch (ArgException e
) {
// Remember the reload generation this instance starts from (checked in needReload()).
45 d_lastreload
= s_reloadcount
;
// Opens the LMDB environment found at getArg("datapath") in read-only mode,
// begins one read-only transaction, and opens a database handle plus a cursor
// for each table: "zone", "data" (MDB_DUPSORT), "extended_data", and the
// DNSSEC tables "rrsig" (MDB_DUPSORT) and "nsecx".
// Throws PDNSException when the linked LMDB library is older than 0.9.8 or
// when any of the mdb_* calls returns a non-zero status; the message contains
// the path and mdb_strerror() of that status.
49 void LMDBBackend::open_db() {
50 L
<<Logger::Error
<<"Loading LMDB database " << getArg("datapath") << endl
;
52 string path
= getArg("datapath");
54 int major
, minor
, patch
;
// mdb_version() fills in the numeric parts and returns the version string used in the error below.
56 string
verstring( mdb_version( &major
, &minor
, &patch
) );
57 if( MDB_VERINT( major
, minor
, patch
) < MDB_VERINT( 0, 9, 8 ) )
58 throw PDNSException( "LMDB Library version too old (" + verstring
+ "). Needs to be 0.9.8 or greater" );
62 if( (rc
= mdb_env_create(&env
)) )
63 throw PDNSException("Couldn't open LMDB database " + path
+ ": mdb_env_create() returned " + mdb_strerror(rc
));
// Five named databases are needed with DNSSEC (adds rrsig and nsecx), three without.
65 if( (rc
= mdb_env_set_maxdbs( env
, d_doDnssec
? 5 : 3)) )
66 throw PDNSException("Couldn't open LMDB database " + path
+ ": mdb_env_set_maxdbs() returned " + mdb_strerror(rc
));
68 if( (rc
= mdb_env_open(env
, path
.c_str(), MDB_RDONLY
, 0)) )
69 throw PDNSException("Couldn't open LMDB database " + path
+ ": mdb_env_open() returned " + mdb_strerror(rc
));
// A single read-only transaction is shared by all cursors opened below.
71 if( (rc
= mdb_txn_begin(env
, NULL
, MDB_RDONLY
, &txn
) ))
72 throw PDNSException("Couldn't start LMDB txn " + path
+ ": mdb_txn_begin() returned " + mdb_strerror(rc
));
74 if( (rc
= mdb_dbi_open(txn
, "zone", 0, &zone_db
) ) )
75 throw PDNSException("Couldn't open LMDB zone database " + path
+ ": mdb_dbi_open() returned " + mdb_strerror(rc
));
76 if( (rc
= mdb_cursor_open(txn
, zone_db
, &zone_cursor
) ))
77 throw PDNSException("Couldn't open cursor on LMDB zone database " + path
+ ": mdb_cursor_open() returned " + mdb_strerror(rc
));
79 if( (rc
= mdb_dbi_open(txn
, "data", MDB_DUPSORT
, &data_db
) ))
80 throw PDNSException("Couldn't open LMDB data database " + path
+ ": mdb_dbi_open() returned " + mdb_strerror(rc
));
81 if( (rc
= mdb_cursor_open(txn
, data_db
, &data_cursor
) ))
82 throw PDNSException("Couldn't open cursor on LMDB data database " + path
+ ": mdb_cursor_open() returned " + mdb_strerror(rc
));
84 if( (rc
= mdb_dbi_open(txn
, "extended_data", 0, &data_extended_db
) ))
85 throw PDNSException("Couldn't open LMDB extended_data database " + path
+ ": mdb_dbi_open() returned " + mdb_strerror(rc
));
86 if( ( rc
= mdb_cursor_open(txn
, data_extended_db
, &data_extended_cursor
)) )
87 throw PDNSException("Couldn't open cursor on LMDB data_extended database " + path
+ ": mdb_cursor_open() returned " + mdb_strerror(rc
));
// DNSSEC tables. NOTE(review): the guard around this section is not visible in
// this excerpt; presumably it is conditional on d_doDnssec — confirm.
90 DEBUGLOG("Experimental dnssec support enabled"<<endl
);
91 if( (rc
= mdb_dbi_open(txn
, "rrsig", MDB_DUPSORT
, &rrsig_db
) ))
92 throw PDNSException("Couldn't open LMDB rrsig database " + path
+ ": mdb_dbi_open() returned " + mdb_strerror(rc
));
93 if( ( rc
= mdb_cursor_open(txn
, rrsig_db
, &rrsig_cursor
)) )
94 throw PDNSException("Couldn't open cursor on LMDB rrsig database " + path
+ ": mdb_cursor_open() returned " + mdb_strerror(rc
));
96 if( (rc
= mdb_dbi_open(txn
, "nsecx", 0, &nsecx_db
) ))
97 throw PDNSException("Couldn't open LMDB nsecx database " + path
+ ": mdb_dbi_open() returned " + mdb_strerror(rc
));
98 if( ( rc
= mdb_cursor_open(txn
, nsecx_db
, &nsecx_cursor
)) )
99 throw PDNSException("Couldn't open cursor on LMDB nsecx database " + path
+ ": mdb_cursor_open() returned " + mdb_strerror(rc
));
// Releases what open_db() acquired: logs the shutdown, closes the data, zone
// and extended-data cursors, then their database handles, and finally the
// rrsig/nsecx cursors and handles.
// NOTE(review): the lines between and after these calls are not visible in
// this excerpt (presumably a d_doDnssec guard and the transaction/environment
// teardown) — confirm against the full file.
103 void LMDBBackend::close_db() {
104 L
<<Logger::Error
<<"Closing LMDB database"<< endl
;
106 mdb_cursor_close(data_cursor
);
107 mdb_cursor_close(zone_cursor
);
108 mdb_cursor_close(data_extended_cursor
);
109 mdb_dbi_close(env
, data_db
);
110 mdb_dbi_close(env
, zone_db
);
111 mdb_dbi_close(env
, data_extended_db
);
// DNSSEC-only handles (opened in the DNSSEC section of open_db()).
113 mdb_cursor_close(rrsig_cursor
);
114 mdb_cursor_close(nsecx_cursor
);
115 mdb_dbi_close(env
, rrsig_db
);
116 mdb_dbi_close(env
, nsecx_db
);
// Destructor.
// NOTE(review): the body is not visible in this excerpt; presumably it releases the LMDB handles — confirm.
122 LMDBBackend::~LMDBBackend()
// Reload hook.
// NOTE(review): the body is not visible in this excerpt; presumably it bumps s_reloadcount so instances notice in needReload() — confirm.
127 void LMDBBackend::reload() {
// Checks whether the class-wide reload counter has moved past the value this
// instance last saw; if so, the instance catches up by storing the new value.
// NOTE(review): the remaining statements in the branch are not visible in this
// excerpt; presumably the database is closed and reopened — confirm.
131 void LMDBBackend::needReload() {
132 if (s_reloadcount
> d_lastreload
) {
133 d_lastreload
= s_reloadcount
;
// Looks up zone metadata of the given kind for zone `name`.
// Only the kinds "PRESIGNED" and "NSEC3PARAM" enter the lookup shown here.
// The zone record is fetched from the zone table with an exact-key lookup and
// split on tabs; when it has four fields, the fourth field is used: for
// "NSEC3PARAM" it is appended to `meta` unless it equals "1".
// NOTE(review): the "PRESIGNED" branch body and the return statements are not
// visible in this excerpt.
139 bool LMDBBackend::getDomainMetadata(const string
& name
, const std::string
& kind
, std::vector
<std::string
>& meta
)
146 if (kind
== "PRESIGNED" || kind
== "NSEC3PARAM") {
149 string key_str
, cur_value
;
150 vector
<string
> valparts
;
// Zone keys are stored as the lower-cased, label-reversed, bit-flipped name followed by a 0xff byte.
152 key_str
=bitFlip(labelReverse(toLower(name
)))+"\xff";
153 key
.mv_data
= (char *)key_str
.c_str();
154 key
.mv_size
= key_str
.length();
156 if ((rc
= mdb_cursor_get(zone_cursor
, &key
, &data
, MDB_SET_KEY
)) == 0) {
157 cur_value
.assign((const char *)data
.mv_data
, data
.mv_size
);
158 stringtok(valparts
,cur_value
,"\t");
160 if (valparts
.size() == 4) {
161 if (kind
== "PRESIGNED")
163 else if (valparts
[3] != "1")
164 meta
.push_back(valparts
[3]);
// A missing zone record is only reported through the debug log.
168 if (rc
== MDB_NOTFOUND
)
169 DEBUGLOG("Metadata records for zone: '"<<name
<<"'' not found. This is impossible !!!"<<endl
);
// Fetches the NSEC/NSEC3 record covering `hashed` in zone `id` from the nsecx table.
// The search key is "<id>\t" followed by the bit-flipped name plus 0xff for
// NSEC, or by the base32hex form of the bit-flipped hash otherwise. A range
// lookup (MDB_SET_RANGE) positions the cursor; the hit is accepted when the
// key matches exactly or its id field equals `id`. Otherwise a second lookup
// keyed on "<id>\t" is tried. On success `before` receives the decoded key
// name and `rr` is filled from the four tab-separated value fields
// (qname, ttl, type, content) with d_place set to AUTHORITY.
// Throws PDNSException when a fetched record does not have two key fields and
// four value fields.
// NOTE(review): several lines (else branches, returns) are not visible in this
// excerpt, so the exact control flow between the two lookups cannot be
// confirmed from here.
175 bool LMDBBackend::getDirectNSECx(uint32_t id
, const string
&hashed
, const QType
&qtype
, string
&before
, DNSResourceRecord
&rr
)
183 string key_str
, cur_key
, cur_value
;
184 vector
<string
> keyparts
, valparts
;
186 if (qtype
== QType::NSEC
)
187 key_str
=itoa(id
)+"\t"+bitFlip(hashed
)+"\xff";
189 key_str
=itoa(id
)+"\t"+toBase32Hex(bitFlip(hashed
));
190 key
.mv_data
= (char *)key_str
.c_str();
191 key
.mv_size
= key_str
.length();
// mdb_cursor_get() returns 0 on success, hence the negation.
194 if(!mdb_cursor_get(nsecx_cursor
, &key
, &data
, MDB_SET_RANGE
)) {
195 cur_key
.assign((const char *)key
.mv_data
, key
.mv_size
);
196 cur_value
.assign((const char *)data
.mv_data
, data
.mv_size
);
197 stringtok(keyparts
,cur_key
,"\t");
198 stringtok(valparts
,cur_value
,"\t");
200 if( keyparts
.size() != 2 || valparts
.size() != 4 ) {
201 throw PDNSException("Invalid record in nsecx table: key: '" + cur_key
+ "'; value: "+ cur_value
);
204 // is the key a full match or does the id part match our zone?
205 // if it does we have a valid answer.
206 if (!key_str
.compare(cur_key
) || atoi(keyparts
[0].c_str()) == (int) id
) // FIXME we need atoui
209 // no match, now we look for the last record in the NSECx chain.
210 key_str
=itoa(id
)+"\t";
211 key
.mv_data
= (char *)key_str
.c_str();
212 key
.mv_size
= key_str
.length();
214 if(!mdb_cursor_get(nsecx_cursor
, &key
, &data
, MDB_NEXT_NODUP
)) {
215 cur_key
.assign((const char *)key
.mv_data
, key
.mv_size
);
216 cur_value
.assign((const char *)data
.mv_data
, data
.mv_size
);
217 stringtok(keyparts
,cur_key
,"\t");
218 stringtok(valparts
,cur_value
,"\t");
220 if( keyparts
.size() != 2 || valparts
.size() != 4 ) {
221 throw PDNSException("Invalid record in nsecx table: key: '" + cur_key
+ "'; value: "+ cur_value
);
224 if (!key_str
.compare(cur_key
) || atoi(keyparts
[0].c_str()) == (int) id
) // FIXME we need atoui
228 DEBUGLOG("NSECx record for '"<<toBase32Hex(bitFlip(hashed
))<<"'' in zone '"<<id
<<"' not found"<<endl
);
// NOTE(review): assigning through .c_str() stops at the first NUL byte, so a
// bit-flipped name containing one would be truncated here — confirm intended.
232 if (qtype
== QType::NSEC
)
233 before
=bitFlip(keyparts
[1]).c_str();
235 before
=bitFlip(fromBase32Hex(keyparts
[1]));
236 rr
.qname
=valparts
[0];
237 rr
.ttl
=atoi(valparts
[1].c_str());
238 rr
.qtype
=DNSRecordContent::TypeToNumber(valparts
[2]);
239 rr
.content
=valparts
[3];
240 rr
.d_place
=DNSResourceRecord::AUTHORITY
;
// Collects the stored RRSIG records for (signer, qname, qtype) into `rrsigs`.
// The key is "<signer>\t<qname relative to signer>\t<type name>"; an exact-key
// lookup positions the rrsig cursor and the loop then walks all duplicates of
// that key (MDB_NEXT_DUP). Each value must consist of two tab-separated
// fields, ttl and content.
// Throws PDNSException when a value does not have exactly two fields.
// NOTE(review): the return statements and a few assignments on `rr` are not
// visible in this excerpt.
247 bool LMDBBackend::getDirectRRSIGs(const string
&signer
, const string
&qname
, const QType
&qtype
, vector
<DNSResourceRecord
> &rrsigs
)
256 string key_str
, cur_value
;
257 vector
<string
> valparts
;
259 key_str
=signer
+"\t"+makeRelative(qname
, signer
)+"\t"+qtype
.getName();
260 key
.mv_data
= (char *)key_str
.c_str();
261 key
.mv_size
= key_str
.length();
263 if ((rc
= mdb_cursor_get(rrsig_cursor
, &key
, &data
, MDB_SET_KEY
)) == 0) {
264 DNSResourceRecord rr
;
266 rr
.qtype
=QType::RRSIG
;
267 //rr.d_place = (DNSResourceRecord::Place) signPlace;
271 cur_value
.assign((const char *)data
.mv_data
, data
.mv_size
);
272 stringtok(valparts
,cur_value
,"\t");
274 if( valparts
.size() != 2 ) {
275 throw PDNSException("Invalid record in rrsig table: qname: '" + qname
+ "'; value: "+ cur_value
);
278 rr
.ttl
=atoi(valparts
[0].c_str());
279 rr
.content
= valparts
[1];
280 rrsigs
.push_back(rr
);
// Keep going while further duplicates exist under the same key.
282 } while (mdb_cursor_get(rrsig_cursor
, &key
, &data
, MDB_NEXT_DUP
) == 0);
// A miss is only reported through the debug log.
285 if (rc
== MDB_NOTFOUND
)
286 DEBUGLOG("RRSIG records for qname: '"<<qname
<<"'' with type: '"<<qtype
.getName()<<"' not found"<<endl
);
291 // Get the zone name of the requested zone (labelReversed) OR the name of the closest parent zone
// Finds the authoritative zone for the label-reversed name in `rev_zone`.
// The transaction and all cursors are renewed first so the lookup sees the
// latest committed data. A range lookup on the zone table with the
// bit-flipped "<rev_zone> " key is then accepted when the returned key is not
// longer than the search key, agrees with it on the first mv_size-1 bytes, and
// the search key has the bit-flipped space separator at position mv_size-1.
// On a match `rev_zone` is shortened to the matched zone (mv_size-1 bytes).
// NOTE(review): the return statements and the no-match path are only partly
// visible in this excerpt.
292 bool LMDBBackend::getAuthZone( string
&rev_zone
)
297 // XXX can do this just using char *
299 string key_str
=bitFlip(rev_zone
+" ");
300 key
.mv_data
= (char *)key_str
.c_str();
301 key
.mv_size
= key_str
.length();
303 // Release our transaction and cursors in order to get latest data
// NOTE(review): the status codes of the reset/renew calls are not checked in the visible lines.
304 mdb_txn_reset( txn
);
305 mdb_txn_renew( txn
);
306 mdb_cursor_renew( txn
, zone_cursor
);
307 mdb_cursor_renew( txn
, data_cursor
);
308 mdb_cursor_renew( txn
, data_extended_cursor
);
310 mdb_cursor_renew( txn
, rrsig_cursor
);
311 mdb_cursor_renew( txn
, nsecx_cursor
);
314 // Find the best record
315 if( mdb_cursor_get( zone_cursor
, &key
, &data
, MDB_SET_RANGE
) == 0 && key
.mv_size
<= key_str
.length() ) {
316 // Found a shorter match. Now look if the zones are equal up to key-length-1. If they are check
317 // if position key-length in key_str is a label separator. If all this is true we have a match.
// NOTE(review): mv_size-1 is evaluated before the `key.mv_size &&` non-zero
// check, and `~' '` is an int (-33), so the equality can only hold where plain
// char is signed — confirm on targets with unsigned char.
318 if( key_str
.compare( 0, key
.mv_size
-1, (const char *) key
.mv_data
, key
.mv_size
-1 ) == 0 && key
.mv_size
&& key_str
[key
.mv_size
-1] == ~' ') {
319 rev_zone
.resize( key
.mv_size
-1 );
321 DEBUGLOG("Auth key: " << rev_zone
<<endl
);
327 // Reset the cursor; the data in it is invalid
328 mdb_cursor_renew( txn
, zone_cursor
);
// Fills `soa` from the zone record the zone cursor currently points at.
// The record value is split on tabs: field 0 becomes soa.domain_id, field 1
// soa.ttl, and field 2 is handed to fillSOAData().
// Throws PDNSException ("Invalid record in zone table") for a malformed record.
// NOTE(review): the validity condition, the declaration of `parts`, and the
// return statements are not visible in this excerpt; `p` is unused in the
// visible lines.
333 bool LMDBBackend::getAuthData( SOAData
&soa
, DNSPacket
*p
)
// MDB_GET_CURRENT re-reads the record at the cursor's current position.
338 if( mdb_cursor_get(zone_cursor
, &key
, &value
, MDB_GET_CURRENT
) )
341 string
data( (const char *)value
.mv_data
, value
.mv_size
);
342 DEBUGLOG("Auth record data " << data
<<endl
);
344 // XXX do this in C too
346 stringtok(parts
,data
,"\t");
349 throw PDNSException("Invalid record in zone table: " + data
);
351 fillSOAData( parts
[2], soa
);
353 soa
.domain_id
= atoi( parts
[0].c_str() );
354 soa
.ttl
= atoi( parts
[1].c_str() );
362 // Called to start an AXFR then ->get() is called. Return true if the domain exists
// Prepares a zone listing (AXFR): remembers the target name and zone id and
// switches the current query type to AXFR so that later get() calls iterate
// over the zone.
// NOTE(review): `include_disabled` is unused in the visible lines, and the
// return statement is not visible in this excerpt.
363 bool LMDBBackend::list(const string
&target
, int zoneId
, bool include_disabled
) {
364 DEBUGLOG("list() requested for " <<target
<< endl
);
366 d_origdomain
= target
;
367 d_domain_id
= zoneId
;
368 d_curqtype
= QType::AXFR
;
370 // getSOA will have been called first to ensure the domain exists so if
371 // that's the case then there's no reason we can't AXFR it.
// Starts a lookup for `inQdomain`: logs the request and stores the name, in
// its original casing, in d_origdomain for the following get() calls.
// NOTE(review): the other assignments (presumably query type and first-call
// flag) are not visible in this excerpt; `p` and `zoneId` are unused in the
// visible lines.
376 void LMDBBackend::lookup(const QType
&type
, const string
&inQdomain
, DNSPacket
*p
, int zoneId
)
378 DEBUGLOG("lookup: " <<inQdomain
<< " " << type
.getName() << endl
);
383 d_origdomain
= inQdomain
;
// Helper whose result get() returns at every point where iteration ends.
// NOTE(review): the body is not visible in this excerpt; presumably it clears the query state and returns false — confirm.
387 inline bool LMDBBackend::get_finished()
// Produces the next record of the lookup/list in progress into `rr`.
// The search key is built from the reversed, lower-cased query name, with the
// query type name appended for exact-type lookups. Exact-type queries use an
// exact-key lookup and then walk that key's duplicates (MDB_SET_KEY /
// MDB_NEXT_DUP); ANY and AXFR use a range scan (MDB_SET_RANGE / MDB_NEXT).
// Values of the form "REF\t<id>" are replaced by the matching value from the
// extended-data table. Iteration ends through get_finished() when the cursor
// runs out, an empty key comes back, or the current key no longer starts with
// the search key.
// Throws PDNSException when a referenced extended record is missing or when a
// record does not have two key fields and three value fields.
// NOTE(review): many lines (first-call handling, separators appended to the
// keys, else branches, returns) are not visible in this excerpt, so only the
// steps shown below are described.
394 bool LMDBBackend::get(DNSResourceRecord
&rr
)
397 bool is_axfr
= (d_curqtype
== QType::AXFR
);
// "Full key" means name plus type are both known, i.e. neither AXFR nor ANY.
398 bool is_full_key
= ( ! is_axfr
&& d_curqtype
!= QType::ANY
);
400 DEBUGLOG("get : " <<d_origdomain
<< endl
);
401 if( !d_origdomain
.length() )
404 DEBUGLOG("Starting Q " << d_first
<< endl
);
409 // Reverse the query string
410 string lowerq
= toLower( d_origdomain
);
411 d_querykey
= string( lowerq
.rbegin(), lowerq
.rend() );
412 d_searchkey
= d_querykey
;
414 // For normal queries ensure that we are only trying to get the exact
415 // record and also try to specify the type too to make negatives a lot
420 // Search by query type too to easily exclude anything that doesn't
423 d_searchkey
+= d_curqtype
.getName();
426 key
.mv_size
= d_searchkey
.length();
427 key
.mv_data
= (char *)d_searchkey
.c_str();
// Initial positioning of the data cursor.
428 if( mdb_cursor_get(data_cursor
, &key
, &value
, is_full_key
? MDB_SET_KEY
: MDB_SET_RANGE
) )
429 return get_finished();
// Advancing the data cursor to the following record.
433 if( mdb_cursor_get(data_cursor
, &key
, &value
, is_full_key
? MDB_NEXT_DUP
: MDB_NEXT
) )
434 return get_finished();
437 // Some buggy versions of lmdb will do this. Should be caught in opendb above though.
438 if( key
.mv_size
== 0 ) {
439 DEBUGLOG("No key returned. Error" << endl
);
440 return get_finished();
443 string
cur_value((const char *)value
.mv_data
, value
.mv_size
);
444 string
cur_key((const char *)key
.mv_data
, key
.mv_size
);
446 DEBUGLOG("querykey: " << d_querykey
<< "; cur_key: " <<cur_key
<< "; cur_value: '" << cur_value
<< "'" << endl
);
448 vector
<string
> keyparts
, valparts
;
450 stringtok(keyparts
,cur_key
,"\t");
451 stringtok(valparts
,cur_value
,"\t");
// A two-field value starting with "REF" points into the extended-data table.
453 if( valparts
.size() == 2 && valparts
[0] == "REF" ) {
454 MDB_val extended_key
, extended_val
;
456 // XXX parse into an int and have extended table as MDB_INTEGER to have
457 // a bit better performance/smaller space?
458 extended_key
.mv_data
= (char *)valparts
[1].c_str();
459 extended_key
.mv_size
= valparts
[1].length();
461 if( int rc
= mdb_cursor_get( data_extended_cursor
, &extended_key
, &extended_val
, MDB_SET_KEY
) )
462 throw PDNSException("Record " + cur_key
+ " references extended record " + cur_value
+ " but this doesn't exist: " + mdb_strerror( rc
));
464 cur_value
.assign((const char *)extended_val
.mv_data
, extended_val
.mv_size
);
466 stringtok(valparts
, cur_value
, "\t");
// A value with a field count other than three gets "." appended before validation.
469 if (valparts
.size() != 3) // FIXME
470 valparts
.push_back(".");
472 if( keyparts
.size() != 2 || valparts
.size() != 3 )
473 throw PDNSException("Invalid record in record table: key: '" + cur_key
+ "'; value: '"+ cur_value
+"'");
475 string compare_string
= cur_key
.substr(0, d_searchkey
.length());
476 DEBUGLOG( "searchkey: " << d_searchkey
<< "; compare: " << compare_string
<< ";" << endl
);
478 // If we're onto records not beginning with this search prefix, then we
479 // must be past the end
480 if( compare_string
.compare( d_searchkey
) )
481 return get_finished();
483 int domain_id
= atoi( valparts
[0].c_str() );
485 // If we are doing an AXFR and the record fetched has been outside of our domain then end the transfer
487 // Check it's not a subdomain ie belongs to this record
488 if( domain_id
!= d_domain_id
)
491 // If it's under the main domain then append the . to the comparison to
492 // ensure items outside our zone don't enter
493 if( keyparts
[0].length() > d_querykey
.length() ) {
494 string test
= d_querykey
;
497 compare_string
= cur_key
.substr(0, d_querykey
.length() + 1);
499 DEBUGLOG("test: " << test
<< "; compare: " << compare_string
<< ";" << endl
);
501 if( test
.compare( compare_string
) )
505 // We need to maintain query casing so strip off domain (less dot) and append original query
506 string sub
= keyparts
[0].substr( d_origdomain
.length(), string::npos
);
507 rr
.qname
= string( sub
.rbegin(), sub
.rend() ) + d_origdomain
;
509 rr
.qname
= d_origdomain
; // use cached and original casing
511 DEBUGLOG("Found record: " <<cur_key
<< ": "<<valparts
.size() << endl
);
513 DEBUGLOG("pass! " << rr
.qname
<< ";" << endl
);
514 rr
.qtype
= keyparts
[1];
516 /* Filter records to only match query type */
517 if( d_curqtype
!= QType::ANY
&& !is_axfr
&& rr
.qtype
!= d_curqtype
)
520 DEBUGLOG("Correct record type" << endl
);
// Value fields are: domain id, ttl, content.
523 rr
.domain_id
= domain_id
;
524 rr
.ttl
= atoi( valparts
[1].c_str() );
525 rr
.content
= valparts
[2];
// Backend factory registered under the name "lmdb". It declares the backend's
// configuration arguments and creates LMDBBackend instances.
530 class LMDBFactory
: public BackendFactory
533 LMDBFactory() : BackendFactory("lmdb") {}
// Declares the two settings: "datapath" (default "/etc/pdns/data") and
// "experimental-dnssec" (default "no").
534 void declareArguments(const string
&suffix
="")
536 declare(suffix
,"datapath","Path to the directory containing the lmdb files","/etc/pdns/data");
537 declare(suffix
,"experimental-dnssec","Enable experimental DNSSEC processing","no");
// Returns a newly allocated backend for the given suffix; the caller receives the raw pointer.
539 DNSBackend
*make(const string
&suffix
="")
541 return new LMDBBackend(suffix
);
// Registers a new LMDBFactory with BackendMakers() and logs a version banner
// (VERSION plus the build date and time) at Info level.
// NOTE(review): the enclosing LMDBLoader class header is not visible in this
// excerpt; presumably these statements form its constructor body — confirm.
552 BackendMakers().report(new LMDBFactory
);
553 L
<< Logger::Info
<< "[lmdbbackend] This is the lmdb backend version " VERSION
" (" __DATE__
", " __TIME__
") reporting" << endl
;
// File-local static instance of the loader, constructed during static initialisation.
557 static LMDBLoader lmdbLoader
;