Commit ebce69dd authored by Edvard Rejthar's avatar Edvard Rejthar
Browse files

server development

Signed-off-by: Edvard Rejthar's avatarEdvard Rejthar <>
parent f48ea5fb
......@@ -10,7 +10,7 @@ apt install software-properties-common
add-apt-repository "deb $(lsb_release -sc) main universe restricted multiverse"
apt update
apt install firefox python3 mariadb-server xvfb
pip3 install xvfbwrapper pymysql peewee flask wtforms pyyaml bs4 pygments pillow requests
pip3 install xvfbwrapper pymysql peewee flask wtforms pyyaml bs4 pygments pillow requests humanize filelock
# current dir
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
......@@ -33,7 +33,7 @@ browser.runtime.onMessage.addListener(function (message) {
* traffic listener
let trafficBlacklist = ["", "", "", "", "", "", "http://localhost/redirect/"];
// let trafficBlacklist = ["", "", "", "", "", "", "http://localhost/redirect/"];
function (details) {
// log resources that we want
......@@ -104,7 +104,7 @@ function screenshot() {
} else {
screenshotted = true;
console.warn("Starting screenshot countdown...");
console.warn("MDMAUG: Starting screenshot countdown...");
setTimeout(function () {
// total scroll height of tab
let height = Math.max(document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);
......@@ -2,32 +2,34 @@
import logging
import os
from flask import Flask, request
#logging.basicConfig(level=logging.DEBUG, format="%(message)s") # init before flask so that logging is shown in the Terminal
from flask import Flask
from xvfbwrapper import Xvfb
from .lib.config import Config
from .lib.controller.api import Api
# import ipdb; ipdb.set_trace()
logging.basicConfig(level=logging.DEBUG, format="%(message)s")
# assure the logging dir
if not os.path.exists(Config.LOG_DIR):
logger = logging.getLogger("mdmaug")
# server setup
# address = ''
app = Flask(__name__, static_url_path="/static")
app.secret_key = b'as8!r"afERaa5'
app.config["preferences"] = {
"safebrowsing": True,
"pdns": True,
"geoip": False
# app.config["preferences"] = {
# "safebrowsing": True,
# "pdns": True,
# "geoip": False
# }
httpd = HTTPServer((address, Config.APP_PORT), Server)
......@@ -45,8 +47,9 @@ try:
from .lib.controller.server import app as server
app.register_blueprint(server)'', ssl_context=(Config.DIR + 'cert-mdmaug.pem', Config.DIR + 'key-mdmaug.pem'), threaded=True)
# for _ in range(Config.profileCount):
if __name__ == "__main__":, port=Config.APP_PORT, ssl_context=(Config.DIR + 'cert-mdmaug.pem', Config.DIR + 'key-mdmaug.pem'), threaded=True)
# for _ in range(Config.profile_count):
# threading.Thread(target=httpd.serve_forever).start()
except (KeyboardInterrupt, SystemExit):
#!/usr/bin/env python3
# Browser extension can write to a file thanks to this.
import json
import os
import struct
import sys
# Read a message from stdin and decode it.
profile = os.environ["PROFILE"] if "PROFILE" in os.environ else "unknown-profile"
#log_dir = "/tmp/mdmaug/.cache/mdmaug-scans/_tmp/" # /tmp/ is small (200 MB) and takes precious RAM
cache_dir = os.environ["CACHE_DIR"] if "CACHE_DIR" in os.environ else "/tmp/"
files_encountered = set()
#cache_dir = log_dir # // default dir to store the analysis is log_dir if we fail to identify a better storage point
#with open(log_dir+"cache.dir", "r") as f:
# cache_dir =
def get_message():
raw_length =
if len(raw_length) == 0:
message_length = struct.unpack('@I', raw_length)[0]
message ='utf-8')
return json.loads(message)
return json.loads('utf-8'))
while True:
message = get_message()
file = cache_dir+message["filename"]
file = cache_dir + message["filename"]
if file in files_encountered:
method = "a"
method = "a"
method = "w"
with open(file, method) as f:
with open("/tmp/ram/zde.txt", "a") as f:
f.write(f"{file} {method}\n")
......@@ -2,15 +2,27 @@ import logging
import os
import threading
from flask import g
from peewee import MySQLDatabase
logger = logging.getLogger("mdmaug")
class Pref:
safebrowsing = True
pdns = True
geoip = False
def val2html(cls, param):
return "1" if getattr(cls, param) in [True, "true", "1"] else "0"
class Config:
profileCount = 21 # number of Firefox profiles. Its name is just a number – 0,1...
profile_count = 2 # number of Firefox profiles. Its name is just a number – 0,1...
browser = 'firefox' # iceweasel, firefox. What browser gets launched.
configFile = '/opt/mdmaug/.cache/mdmaug-scans/_tmp/queue.cache' # RAM disk was too small: '/tmp/mdm/queue.cache'
APP_PORT = 8000
config_file = '/opt/mdmaug/.cache/mdmaug-scans/_tmp/queue.cache' # RAM disk was too small: '/tmp/mdm/queue.cache'
APP_PORT = 5000
APP_IP = ""
APP_DOMAIN = '' + str(APP_PORT) #
LOG_DIR = "/opt/mdmaug/.cache/mdmaug-scans/_tmp/"
CACHE_DIR = "/opt/mdmaug/.cache/mdmaug-scans/"
......@@ -22,10 +34,11 @@ class Config:
MAX_BROWSER_RUN_TIME = 25 # maximum time for a browser to run
MAX_BROWSER_EXPIRATION = 15 # seconds that we wait before killing the browser (waiting for the files to be written)
def connect():
# XX resim problem peewee.OperationalError: (2006, "MySQL server has gone away (BrokenPipeError(32, 'Broken pipe'))") po 7 hodinach timeoutu
# XX kupodivu pripojeni nemuze byt v dbp DBModel.connect. Prestoze type je pak spravne (MySQLDatabase), nic udelat nejde a pokusy o select konci NoneType.
logging.debug("Connecting to DB.")
logger.debug("Connecting to DB.")
Config.myDB = MySQLDatabase("mdmaug", host='localhost', port=3306, user="mdmaug",
passwd="fidFDSs676") # XX dal jsem pryc: , threadlocals=False
......@@ -5,13 +5,14 @@ from urllib.parse import urlparse
from peewee import IntegrityError
from import Domains
from import domain2dir
from .scan_controller import ScanController
from ..config import Config
from ..model.dbp import Turris, Whitelist
from ..parser.traffic_log_parser import TrafficLogParser
from ...templates.crawl_view import CrawlView
logger = logging.getLogger("mdmaug")
class Api:
website = "" #
......@@ -19,7 +20,7 @@ class Api:
def run(self, request):
""" Accept command
:type path: dict from URL request. /api/analyze=cache/ → {"api": True, "analyze": cache, "page": ""}
:type request: dict from URL request. /api/analyze=cache/ → {"api": True, "analyze": cache, "page": ""}
crawl = None
......@@ -34,13 +35,13 @@ class Api:
elif "decide" in request: # XX deprecated?
return self.get_undecided()
elif "nicify" in request:
return TrafficLogParser.getStylesheet() + TrafficLogParser.nicifyFile(request["page"])
return TrafficLogParser.getStylesheet() + TrafficLogParser.nicify_file(request["page"])
elif "vote" in request: # /api/vote/block/
logging.debug("vote cmd")
logger.debug("vote cmd")
return["vote"], request["page"])
elif "scans" in request:
if "url" in request: # /api/scans/url/
domain = Domains.domain2dir(request["page"])
if "url" in request: # /api/scans/url/
domain = domain2dir(request["page"])
if not domain:
return "Wrong domain"
return ScanController().get_domain_snapdirs(domain, full_dirs=False)
......@@ -50,21 +51,23 @@ class Api:
"""url = path.split("/", 3)
if len(url) > 3:
self._setWebsite(url[2]) # osetrit, ze je URL, a nikoli shell
logging.debug("XXX nejsem si jist, zda url je spravne na url[2]") # XXX
logging.debug(url) # XXX
logger.debug("XXX nejsem si jist, zda url je spravne na url[2]") # XXX
logger.debug(url) # XXX
quit() # XXX
return self.whitelist()"""
return "Implement first if needed."
elif "reset" in request:
return "reset"
return "Unknown method."
return "Unknown API method."
if crawl:
if request["api"] == "json":
if type(crawl) is str: # probably an error
return crawl
elif request["api"] == "json":
return CrawlView.output_json(crawl)
elif request["api"] == "mdmaug":
return CrawlView.output_mdmaug(crawl)
......@@ -73,16 +76,16 @@ class Api:
def reset():
logging.debug("resetting running browsers")
with open(Config.configFile, 'w') as f: # clear the queue
logger.debug("resetting running browsers")
with open(Config.config_file, 'w') as f: # clear the queue
json.dump({}, f)["pkill", Config.browser]) # kill frozen browsers
# prida 2ld domenu mezi whitelistovane
def whitelist(self):
# Db.cur = Db.connection.cursor()
# self._logging.debug(Db.cur.execute("""REPLACE INTO whitelist set domain = %s""", (self.websiteDomain, )))
# self._logger.debug(Db.cur.execute("""REPLACE INTO whitelist set domain = %s""", (self.websiteDomain, )))
# Db.connection.commit()
# Db.cur.close()
......@@ -92,5 +95,5 @@ class Api:
def get_undecided():
logging.debug("XXX jeste jsem neudelal - ma vylezt tabulka vsech nerozhodlych domen od posledniho exportu")
logger.debug("XXX jeste jsem neudelal - ma vylezt tabulka vsech nerozhodlych domen od posledniho exportu")
......@@ -6,10 +6,14 @@ import subprocess
import time
import traceback
from glob import glob
from json import JSONDecodeError
from random import randint
from flask import escape
from filelock import FileLock
from ..config import Config
from import Domains
from import domain2dir, assure_url, url2domain
from ..model.crawl import Crawl
from ..parser.metadata_parser import MetadataParser
from ..parser.nspr_log_parser import NsprLogParser
......@@ -17,186 +21,198 @@ from ..parser.screenshot_parser import ScreenshotParser
from ..parser.spy_parser import SpyParser
from ..parser.traffic_log_parser import TrafficLogParser
logger = logging.getLogger("mdmaug")
class ScanController:
FF_INFO_FILE = "cache.dir"
CRAWL_FILE = "crawlSave.yaml"
profile = "-1" # bookovany profile firefoxu
queueFF = {}
profile = "-1" # booked browser profile
url = None
def get_domain_snapdirs(self, domain, full_dirs=True):
dir = Config.CACHE_DIR + domain + "/"
if os.path.isdir(dir):
return [str(dir + subdir) if full_dirs else str(subdir) for subdir in os.listdir(dir) # adresare vsech moznych snapshotu
if os.path.isdir(str(dir + subdir)) and os.path.isfile(dir + subdir + "/" + ScanController.CRAWL_FILE)]
def get_domain_snapdirs(domain, full_dirs=True):
d = Config.CACHE_DIR + domain + "/"
if os.path.isdir(d):
return [str(d + subdir) if full_dirs else str(subdir) for subdir in os.listdir(d) # all possible snapshot directories
if os.path.isdir(str(d + subdir)) and os.path.isfile(d + subdir + "/" + ScanController.CRAWL_FILE)]
def launch(self, url, cached=None):
:param url: scanned url
:type cached: True = Any cached version, int = cached version X days old. If None or not found, site will be reanalysed
if cached:
snapdirs = self.get_domain_snapdirs(Domains.domain2dir(url))
snapdirs = self.get_domain_snapdirs(domain2dir(url))
if snapdirs:
# get the most recent snapdir and check if it's not too old
cache_dir = max(snapdirs, key=os.path.getmtime) + "/"
if cached is True or os.path.getmtime(cache_dir) > time.time() - (3600 * 24 * cached):
logging.debug(f"Returning a previous crawl from: {cache_dir + ScanController.CRAWL_FILE}")
logger.debug(f"Returning a previous crawl from: {cache_dir + ScanController.CRAWL_FILE}")
crawl = Crawl.load_from_file(cache_dir + ScanController.CRAWL_FILE)
return crawl
except ValueError:
logging.debug("({-1}) Convenient cached analysis not found")
logger.debug("({-1}) Convenient cached analysis not found")
u = assure_url(url)
if not u:
return f'Invalid URL {escape(url)}'
self.url = u
# perform fresh analysis
if self.queue(): # /api/analyze/web - queue current analysis
print("({}) start crawl".format(self.profile))
self.url = Domains.assureUrl(url)
self.lock = FileLock(Config.config_file + ".lock")
if self.queue(url): # /api/analyze/web - queue current analysis
print(f"({self.profile}) start crawl")
# noinspection PyBroadException
crawl = self.analyze()
except Exception as e:
logging.debug("PROFILE EXCEPTION {}".format(self.profile))
logger.debug(f"({self.profile}) PROFILE EXCEPTION")
# XX Pokud je potiz, ze JS zabiji FF, mozno experimentovat s ulimit -Sv 500000;
return ("PROFILE EXCEPTION ({}) {} See logs, i.e. mdmaug/nohup.out. ".format(self.profile, e))
return f"PROFILE EXCEPTION ({self.profile}) {e} See logs, i.e. mdmaug/nohup.out. "
crawl.save_to_file(crawl.cacheDir + ScanController.CRAWL_FILE) # ulozit vysledky hledani
crawl.save_to_file(crawl.cacheDir + ScanController.CRAWL_FILE) # save search results
return crawl
else: # analyza se nepodarilo si zabookovat FF profil
logging.debug("no free slots")
result = "failed - no free slots. <a href='" + Config.APP_DOMAIN + "/reset'>Reset</a>" # volny profil jsme nenasli
return "<div id='analysis-results'>{}</div>".format(result)
logger.debug("(-) no free slots")
result = f"Scanning {self.url} failed no free slots. <a href='{Config.APP_DOMAIN}/reset'>Reset</a>"
return f"<div id='analysis-results'>{result}</div>"
def analyze(self):
# spustit firefox pod profilem
print("({}) browser launch".format(self.profile))
print(f"({self.profile}) browser launch")
logDir, cacheDir = self.assureDirs() # pripravit log a cache adresar
log_dir, cache_dir = self.assure_dirs() # prepare log & cache directories
logfile = logDir + "log{}.log".format(self.profile)
logfile = log_dir + "log{}.log".format(self.profile)
# max_time = 3 # XXX
# ,nsSocketTransport:5,nsStreamPump:5,nsHostResolver:5
logging.debug("({}) FF -P {} -no-remote {}".format(self.profile, self.profile, self.url))
command = "export NSPR_LOG_MODULES=timestamp,nsHttp:5 ; export NSPR_LOG_FILE={} ; export CACHE_DIR={}; export PROFILE={};{} -P {} -no-remote '{}'".format(
logfile, cacheDir, self.profile, Config.browser, self.profile,
"http://localhost/redirect/" + self.url) # http://localhost/redirect/ gets stripped by the extension
# terminate Config.browser if hes not able to (everything has to be in single command because there is no heritance of $! amongst subprocesses)
command += " & echo $!;ii=0; while [ -n \"`ps -p $! | grep {}`\" ];do echo \"({}) running\" ;ii=$((ii+1)); if [ $ii -gt {} ]; then echo '({}) kill';kill $!; break;fi; sleep 1; done".format(
Config.browser, self.profile, Config.MAX_BROWSER_RUN_TIME,
self.profile) # (pokud bezi proces $! (posledni backgroudovany process), spi 1 s)
# > /dev/null
logger.debug("({}) FF -P {} -no-remote {}".format(self.profile, self.profile, self.url))
# http://localhost/redirect/ gets stripped by the extension
command = f"export NSPR_LOG_MODULES=timestamp,nsHttp:5 ; export NSPR_LOG_FILE={logfile} ;" \
f" export CACHE_DIR={cache_dir}; export PROFILE={self.profile};" \
f"{Config.browser} -P {self.profile} -no-remote 'http://localhost/redirect/{self.url}'"
# terminate Config.browser if he's not able to
# (everything has to be in single command because there is no heritance of $! amongst subprocesses)
command += f" & echo $!;ii=0; while [ -n \"`ps -p $! | grep {Config.browser}`\" ];" \
f"do echo \"({self.profile}) running\" ;ii=$((ii+1)); if [ $ii -gt {Config.MAX_BROWSER_RUN_TIME} ];" \
f" then echo '({self.profile}) kill';kill $!; break;fi; sleep 1; done" # > /dev/null
logger.debug(command)[command], shell=True)
logging.debug("({}) stopped!".format(self.profile))
logger.debug(f"({self.profile}) stopped!")
# shromazdit informace z analyz
crawl = Crawl(host=self.url, log_dir=logDir, cache_dir=cacheDir)
crawl = Crawl(host=self.url, log_dir=log_dir, cache_dir=cache_dir, profile=self.profile)
expiration = 0
while not os.path.isfile(logfile): # i po zavreni FF nekdy trva, nez se soubor zapise
expiration += 1
logging.debug("({}) waiting to close...".format(self.profile))
logger.debug(f"({self.profile}) waiting to close...")
if expiration > Config.MAX_BROWSER_EXPIRATION:
logging.debug("({}) time is run!".format(self.profile))
logger.debug(f"({self.profile}) time is run!")
raise FileNotFoundError("time is run - browser expired")
NsprLogParser(logfile, crawl)
self.unbookProfile() # uvolnit browser profil
TrafficLogParser(crawl) # obohatit crawl vysledky o analyzu z browseru
MetadataParser(crawl, Domains.url2domain(self.url)) # cekame na whois servery - az po uvolneni profilu
MetadataParser(crawl, url2domain(self.url))
print("({}) thread parsers ends".format(self.profile))
return crawl
def _getCacheDirStamp():
# pro archiv logu pouzit timestamp: #return "current"
def _get_cache_dir_stamp():
return datetime.datetime.fromtimestamp(time.time()).strftime('%y%m%d%H%M%S')
def _assureDir(dirName):
""" Adresar vytvori, nebo promaze. Aby byl cisty k pouziti. """
if not os.path.exists(dirName):
for file in glob(dirName + "*"): # log dir promazat - # pokud byl, smaze jeho obsah z minula (stara analyza)
def _assure_dir(dir_name):
""" Dir is created or cleaned up to be ready for use. """
if not os.path.exists(dir_name):
for file in glob(dir_name + "*"): # log dir promazat - # pokud byl, smaze jeho obsah z minula (stara analyza)
return dirName
return dir_name
def assureDirs(self):
def assure_dirs(self):
""" Vytvori adresar logu a cache, pokud nejsou
Cache ex: /home/mdmaug/.cache/mdmaug-scans/ - sem nageneruje xpi logy
logDir = ScanController._assureDir(Config.LOG_DIR + str(self.profile) + "-log/")
cacheDir = ScanController._assureDir(
Config.CACHE_DIR + Domains.domain2dir(self.url) + "/" + ScanController._getCacheDirStamp() + "/")
log_dir = ScanController._assure_dir(Config.LOG_DIR + str(self.profile) + "-log/")
cache_dir = ScanController._assure_dir(
Config.CACHE_DIR + domain2dir(self.url) + "/" + ScanController._get_cache_dir_stamp() + "/")
# info pro FF
with open(logDir + ScanController.FF_INFO_FILE, "w") as f: # v logDiru mu dame odkaz do cacheDiru
f.write(cacheDir) # napoveda, kde FF najde cache dir (protoze FF najde log dir podle nazvu profilu)
with open(log_dir + ScanController.FF_INFO_FILE, "w") as f: # v logDiru mu dame odkaz do cacheDiru
f.write(cache_dir) # napoveda, kde FF najde cache dir (protoze FF najde log dir podle nazvu profilu)
return logDir, cacheDir
return log_dir, cache_dir
def _loadProfileQueue(self):
# load queue from config file
def _load_profile_queue(self):
# load queue from config file
with open(Config.configFile, 'r') as f:
self.queueFF = json.load(f)
with open(Config.configFile, 'w') as f:
with open(Config.config_file, 'r') as f:
queue = json.load(f)
print(f"*** Loaded {self.profile} {queue}")
except (IOError, JSONDecodeError):
with open(Config.config_file, 'w'):
self.queueFF = {}
def bookProfile(self):
# zabookovat profil firefoxu
self.queueFF[self.profile] = "loading"
with open(Config.configFile, 'w') as f:
json.dump(self.queueFF, f)
queue = {}
return queue
def unbookProfile(self):
def dump():
with open(Config.configFile, 'w') as f:
json.dump(self.queueFF, f)
def _save_profile_queue(self, queue):
with open(Config.config_file, 'w') as f:
print(f"*** Saving {self.profile} {queue}")
json.dump(queue, f)
# logging.debug("UNKBOOK")
except KeyError:
logging.debug("Unbook failed")
except OSError:
"({}) OS Error - interferuje s pustenym FF, ktere zere prilis pameti. Zkusime pockat.".format(self.profile))
time.sleep(10) # XX jestli funkcionalitu zachovat, dat sem pocitadlo, at je na konzoli videt akce
def dequeue(self):
with self.lock:
queue = self._load_profile_queue()
except KeyError:
logger.debug("Unbook failed")
except OSError:
logging.debug("({}) System se nezotavil.".format(self.profile))
return "Memory may be exhausted. See mdmaug-server/ for details." # FF sezral vsechnu pamet asi. Stranka je problematicka. UrlQuery podle me taky selze.
# logging.debug("UNKBOOKED")
def queue(self):
""" Reads from queue.cache what profile is available and books it """
logger.debug(f"({self.profile}) OS Error - interference with a running browser consuming too much memory. "
f"Let's wait 10 s.")
except OSError:
logger.debug(f"({self.profile}) System didn't recover.")
return "Memory may be exhausted. See mdmaug-server/ for details."
# FF used up all the memory. URL is problematic. In my opinion, UrlQuery would fail too.
def queue(self, url):
""" Reads from queue.cache what profile is available and books it
:return: Have we succeeded to book a browser profile?
self.profile = -1
for _ in range(4): # na volny slot zkusime nekolikrat pockat
for i in range(Config.profileCount): # i = 10 if i ==10:
if self.queueFF.get(str(i)) == None:
self.profile = i
for _ in range(4): # wait for a free slot several times
with self.lock:
queue = self._load_profile_queue()
for i in range(Config.profile_count):
if queue.get(str(i)) is None:
self.profile = i
queue[self.profile] = url # X"loading"
return True # we found a free slot, let's proceed
if self.profile == -1:
logging.debug("(-1) FULL, let's wait few secs")
time.sleep(randint(5, 10)) # pockame par vterin