Verified Commit e0a289e3 authored by Martin Petráček's avatar Martin Petráček
Browse files

domains replace: more robust config file format

parent 69bea908
"facebook.com": "facebook.com"
"fbstatic-a.akamaihd.net": "facebook.com"
".fbcdn.net": "facebook.com"
"fbcdn-": "facebook.com"
".facebook.net": "facebook.com"
"youtube.": "youtube.com"
"youtu.be.": "youtube.com"
"yt3.ggpht.com": "youtube.com"
".googlevideo.com": "youtube.com"
".ytimg.com": "youtube.com"
"youtube-nocookie.": "youtube.com"
".google.": "google.com"
".gstatic.com": "google.com"
".googlesyndication.com": "google.com"
".googletagservices.com": "google.com"
".2mdn.net": "google.com"
".doubleclick.net": "google.com"
"googleads.": "google.com"
"google-analytics.": "google.com"
"googleusercontent.": "google.com"
"googleadservices.": "google.com"
"googleapis.com": "google.com"
{
"facebook.com": "facebook.com",
"fbstatic-a.akamaihd.net": "facebook.com",
".fbcdn.net": "facebook.com",
"fbcdn-": "facebook.com",
".facebook.net": "facebook.com",
"youtube.": "youtube.com",
"youtu.be.": "youtube.com",
"yt3.ggpht.com": "youtube.com",
".googlevideo.com": "youtube.com",
".ytimg.com": "youtube.com",
"youtube-nocookie.": "youtube.com",
".google.": "google.com",
".gstatic.com": "google.com",
".googlesyndication.com": "google.com",
".googletagservices.com": "google.com",
".2mdn.net": "google.com",
".doubleclick.net": "google.com",
"googleads.": "google.com",
"google-analytics.": "google.com",
"googleusercontent.": "google.com",
"googleadservices.": "google.com",
"googleapis.com": "google.com",
}
......@@ -13,32 +13,52 @@ import json
interval = 3600
def multiple_replace(text):
    """Substitute *text* using the table attached to this function.

    The caller must set two attributes on the function before calling it:
    ``multiple_replace.rx`` (a compiled pattern whose first group captures
    a key) and ``multiple_replace.adict`` (mapping of key -> replacement).
    Every match of ``rx`` in *text* is replaced by ``adict[group 1]``.
    """
    pattern = multiple_replace.rx
    # The table is looked up at match time, exactly when a key is needed.
    return pattern.sub(lambda m: multiple_replace.adict[m.group(1)], text)
class MultiReplace:
    """Replace a whole string by the value mapped to a key it contains.

    The regex is constructed so that it matches the whole string (``.*`` at
    the beginning and at the end); the actual key from ``adict`` is the first
    group of the match, ignoring any prefix and suffix. The whole string is
    then replaced by ``adict[key]``. Because the leading ``.*`` is greedy,
    when several keys occur in the text the one starting furthest to the
    right is used. Text that contains no key is returned unchanged.
    """

    def __init__(self, adict):
        """Store *adict* (substring -> replacement) and compile its pattern."""
        self.adict = adict
        if adict:
            alternatives = '|'.join(map(re.escape, adict))
            self.rx = re.compile("^.*(" + alternatives + ").*$")
        else:
            # An empty alternation would match every string with an empty
            # group, making replace() fail with KeyError(''). Use a pattern
            # that can never match so replace() becomes a no-op instead.
            self.rx = re.compile(r"(?!)")

    def replace(self, text):
        """Return the replacement for the key found in *text*, else *text*."""
        def one_xlat(match):
            return self.adict[match.group(1)]
        return self.rx.sub(one_xlat, text)
con = sqlite3.connect('/var/lib/pakon.db')
c = con.cursor()
adict={}
try:
data_file = open('/usr/share/pakon-light/domains_replace.json')
adict = json.load(data_file)
with open('/usr/share/pakon-light/domains_replace.conf') as f:
    # Each meaningful line has the form:  "pattern": "replacement"
    for line in f:
        match = re.match(r'\s*"([^"]+)"\s*:\s*"([^"]+)"\s*', line)
        if not match:
            # Silently skip blank / whitespace-only lines. The check must be
            # anchored with $: a bare \s* matches every line, which made the
            # "invalid line" report below unreachable.
            if re.match(r'\s*$', line):
                continue
            print("invalid line: "+line)
            continue
        adict[match.group(1)]=match.group(2)
except IOError:
print("can't load domains_services file")
sys.exit(1)
multiple_replace.adict = adict
multiple_replace.rx = re.compile("^.*("+'|'.join(map(re.escape, adict))+").*$")
if not adict:
print("empty dictionary of replacements, nothing to do")
sys.exit(1)
con = sqlite3.connect('/var/lib/pakon.db')
c = con.cursor()
multiple_replace = MultiReplace(adict)
now = int(time.mktime(datetime.datetime.utcnow().timetuple()))
start = now-interval*2
replaced = 0
for row in c.execute('SELECT DISTINCT(app_hostname) FROM traffic WHERE start >= ? AND app_hostname IS NOT NULL AND flow_id IS NULL', (start,)):
name = multiple_replace(row[0])
name = multiple_replace.replace(row[0])
if name!=row[0]:
t = con.cursor()
t.execute("UPDATE traffic SET app_hostname = ? WHERE app_hostname = ? AND flow_id IS NULL", (name, row[0]))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment