Commit df4679c5 authored by Tomas Krizek's avatar Tomas Krizek

msgdiff: use meta DB to create JSON report

parent fd3ded42
......@@ -80,11 +80,11 @@ class LMDB:
check_notexists: bool = False, drop: bool = False):
assert self.env is not None, "LMDB wasn't initialized!"
if not create and not self.exists_db(dbname):
msg = 'LMDB environment "{}" does not contain DB {}! '.format(
msg = 'LMDB environment "{}" does not contain DB "{}"! '.format(
self.path, dbname.decode('utf-8'))
raise RuntimeError(msg)
if check_notexists and self.exists_db(dbname):
msg = ('LMDB environment "{}" already contains DB {}! '
msg = ('LMDB environment "{}" already contains DB "{}"! '
'Overwritting it would invalidate data in the environment, '
'terminating.').format(self.path, dbname.decode('utf-8'))
raise RuntimeError(msg)
......@@ -118,7 +118,7 @@ class LMDB:
return self.dbs[dbname]
except KeyError:
raise RuntimeError("Database {} isn't open!".format(dbname.decode('utf-8')))
raise ValueError("Database {} isn't open!".format(dbname.decode('utf-8')))
def key_stream(self, dbname: bytes) -> Iterator[bytes]:
"""yield all keys from given db"""
......@@ -228,7 +228,10 @@ class Database(ABC):
if self.db is None:
if not self.DB_NAME:
raise RuntimeError('No database to initialize!')
self.lmdb.open_db(self.DB_NAME, create=True)
self.db = self.lmdb.get_db(self.DB_NAME)
except ValueError:
self.db = self.lmdb.open_db(self.DB_NAME, create=True)
with self.lmdb.env.begin(self.db, write=write) as txn:
yield txn
......@@ -4,6 +4,7 @@ import argparse
from functools import partial
import logging
import multiprocessing.pool as pool
import os
import pickle
from typing import Any, Dict, Iterator, Mapping, Optional, Sequence, Tuple # noqa
import sys
......@@ -232,8 +233,10 @@ def compare_lmdb_wrapper(criteria, target, qid):
txn.put(qid, blob)
def export_json(filename):
report = DiffReport.from_json(filename)
def export_json(filename: str, report: DiffReport):
if lmdb is None:
raise RuntimeError("LMDB wasn't initialized!")
report.other_disagreements = DisagreementsCounter()
report.target_disagreements = Disagreements()
......@@ -250,9 +253,34 @@ def export_json(filename):
for field, mismatch in diff.items():
report.target_disagreements.add_mismatch(field, mismatch, qid)
# it doesn't make sense to use existing report.json
if os.path.exists(filename):
backup_filename = filename + '.bak'
os.rename(filename, backup_filename)
'JSON report already exists, overwriting file. Original '
'file backed up as %s', backup_filename)
def prepare_report(lmdb_):
qdb = lmdb_.open_db(LMDB.QUERIES)
adb = lmdb_.open_db(LMDB.ANSWERS)
with lmdb_.env.begin() as txn:
total_queries = txn.stat(qdb)['entries']
total_answers = txn.stat(adb)['entries']
meta = MetaDatabase(lmdb_)
start_time = meta.read_start_time()
end_time = meta.read_end_time()
return DiffReport(
def main():
global lmdb
......@@ -264,12 +292,15 @@ def main():
args = parser.parse_args()
datafile = cli.get_datafile(args)
datafile = cli.get_datafile(args, check_exists=False)
criteria = args.cfg['diff']['criteria']
target = args.cfg['diff']['target']
# fast=True would later cause lmdb.BadRslotError in conjunction with multiprocessing
with LMDB(args.envdir) as lmdb_:
# NOTE: To avoid an lmdb.BadRslotError, probably caused by weird
# interaction when using multiple transactions / processes, open a separate
# environment. Also, any dbs have to be opened before using MetaDatabase().
report = prepare_report(lmdb_)
meta = MetaDatabase(lmdb_)
......@@ -287,7 +318,7 @@ def main():
with pool.Pool() as p:
for _ in p.imap_unordered(func, qid_stream, chunksize=10):
export_json(datafile, report)
if __name__ == '__main__':
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment