dnscol
1. Generate csv (or csv.gz) files via ./dnscol -C dnscol.conf /srv/andraste/*
2. Create impala table for CSV imports (once)
create table dnscol.csv_import (
time DOUBLE, -- convert to timestamp
delay_us INT,
req_dns_len INT,
resp_dns_len INT,
req_net_len INT,
resp_net_len INT,
client_addr STRING,
client_port INT,
server_addr STRING,
server_port INT,
net_proto INT,
net_ipv INT,
net_ttl INT,
req_udp_sum INT,
id INT,
qtype INT,
qclass INT,
opcode INT,
rcode INT,
resp_aa INT, -- convert to boolean
resp_tc INT, -- convert to boolean
req_rd INT, -- convert to boolean
resp_ra INT, -- convert to boolean
req_z INT, -- convert to boolean
resp_ad INT, -- convert to boolean
req_cd INT, -- convert to boolean
qname STRING,
resp_ancount INT,
resp_arcount INT,
resp_nscount INT,
req_edns_ver INT,
req_edns_udp INT,
req_edns_do INT, -- convert to boolean
resp_edns_rcode INT,
req_edns_ping INT, -- convert to boolean
req_edns_dau STRING,
req_edns_dhu STRING,
req_edns_n3u STRING,
resp_edns_nsid STRING,
edns_client_subnet STRING,
edns_other STRING)
partitioned by (server STRING, batchname STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\';
3. Import CSV files into the table
Upload the data to HDFS:
hdfs dfs -mkdir -p /user/tgavenciak/andraste-import1/
hdfs dfs -put andraste-20160401-*.csv.gz /user/tgavenciak/andraste-import1/
hdfs dfs -chmod a+rwx /user/tgavenciak/andraste-import1/
Import into impala (moves the files into Impala)
alter table dnscol.csv_import add PARTITION (server="andraste", batchname="2016-08-24-tgimport1");
LOAD DATA INPATH '/user/tgavenciak/andraste-import1/' INTO TABLE dnscol.csv_import PARTITION (server="andraste", batchname="2016-08-24-tgimport1");
4. Optional: Create Entrada-like destination table (once)
CREATE TABLE dnscol.queries LIKE dns.queries STORED AS parquet;
csv_import
to queries
5. Copy the data from INSERT INTO dnscol.queries (id, unixtime, time, qname, domainname, len, frag, ttl, ipv, prot, src, srcp, dst, dstp, udp_sum, dns_len,
aa, tc, rd, ra, z, ad, cd, ancount, arcount, nscount, qdcount, opcode, rcode, qtype, qclass, country, asn, edns_udp, edns_version,
edns_do, edns_ping, edns_nsid, edns_dnssec_dau, edns_dnssec_dhu, edns_dnssec_n3u, edns_client_subnet, edns_other,
edns_client_subnet_asn, edns_client_subnet_country, labels, res_len, time_micro, resp_frag, proc_time, is_google, is_opendns,
dns_res_len, server_location)
PARTITION (year, month, day, server)
SELECT
id, -- Entrada: id INT
cast(time as bigint), -- Entrada: unixtime BIGINT
cast((time * 1000) as bigint), -- Entrada: time BIGINT
lower(qname), -- Entrada: qname STRING
regexp_extract(lower(qname), "([^.]+\\.[^.]+)\\.\\z", 1), -- Entrada: domainname STRING
req_net_len, -- Entrada: len INT
NULL, -- Entrada: frag INT
net_ttl, -- Entrada: ttl INT
net_ipv, -- Entrada: ipv INT
net_proto, -- Entrada: prot INT
client_addr, -- Entrada: src STRING
client_port, -- Entrada: srcp INT
server_addr, -- Entrada: dst STRING
server_port, -- Entrada: dstp INT
req_udp_sum, -- Entrada: udp_sum INT
req_dns_len, -- Entrada: dns_len INT
cast(resp_aa as BOOLEAN), -- Entrada: aa BOOLEAN
cast(resp_tc as BOOLEAN), -- Entrada: tc BOOLEAN
cast(req_rd as BOOLEAN), -- Entrada: rd BOOLEAN
cast(resp_ra as BOOLEAN), -- Entrada: ra BOOLEAN
cast(req_z as BOOLEAN), -- Entrada: z BOOLEAN
cast(resp_ad as BOOLEAN), -- Entrada: ad BOOLEAN
cast(req_cd as BOOLEAN), -- Entrada: cd BOOLEAN
resp_ancount, -- Entrada: ancount INT
resp_arcount, -- Entrada: arcount INT
resp_nscount, -- Entrada: nscount INT
if(length(qname) > 0, 1, 0), -- Entrada: qdcount INT
opcode, -- Entrada: opcode INT
rcode, -- Entrada: rcode INT
qtype, -- Entrada: qtype INT
qclass, -- Entrada: qclass INT
NULL, -- Entrada: country STRING
NULL, -- Entrada: asn STRING
req_edns_udp, -- Entrada: edns_udp INT
cast(req_edns_ver as SMALLINT), -- Entrada: edns_version SMALLINT
cast(req_edns_do as BOOLEAN), -- Entrada: edns_do BOOLEAN
cast(req_edns_ping as BOOLEAN), -- Entrada: edns_ping BOOLEAN
resp_edns_nsid, -- Entrada: edns_nsid STRING
req_edns_dau, -- Entrada: edns_dnssec_dau STRING
req_edns_dhu, -- Entrada: edns_dnssec_dhu STRING
req_edns_n3u, -- Entrada: edns_dnssec_n3u STRING
edns_client_subnet, -- Entrada: edns_client_subnet STRING
edns_other, -- Entrada: edns_other STRING
NULL, -- Entrada: edns_client_subnet_asn STRING
NULL, -- Entrada: edns_client_subnet_country STRING
if(length(qname) > 1, length(regexp_replace(qname, "[^\\.]", "")), 0), -- Entrada: labels INT
resp_net_len, -- Entrada: res_len INT
delay_us, -- Entrada: time_micro BIGINT
NULL, -- Entrada: resp_frag INT
NULL, -- Entrada: proc_time INT
find_in_set(client_addr, "8.8.8.8,8.8.4.4,2001:4860:4860::8888,2001:4860:4860::8844") > 0, -- Entrada: is_google BOOLEAN, TODO: update IP set, see https://developers.google.com/speed/public-dns/faq#locations
find_in_set(client_addr, "208.67.222.222,208.67.220.220") > 0, -- Entrada: is_opendns BOOLEAN, TODO: update IP set
resp_dns_len, -- Entrada: dns_res_len INT
NULL, -- Entrada: server_location STRING
extract(year from cast(time as timestamp)), -- Entrada: year INT
extract(month from cast(time as timestamp)), -- Entrada: month INT
extract(day from cast(time as timestamp)), -- Entrada: day INT
server -- Entrada: server STRING
FROM dnscol.csv_import WHERE server="andraste" AND batchname="2016-08-24-tgimport1";