|
|
|
## 1. Generate csv (or csv.gz) files via `dnscol`
|
|
|
|
|
|
|
|
```
|
|
|
|
./dnscol -C dnscol.conf /srv/andraste/*
|
|
|
|
```
|
|
|
|
|
|
|
|
## 2. Create impala table for imports (once)
|
|
|
|
|
|
|
|
```sql
|
|
|
|
create table dnscol.csv_import (
|
|
|
|
time TIMESTAMP, -- has nanosecond precision in Impala
|
|
|
|
delay_us INT,
|
|
|
|
req_dns_len INT,
|
|
|
|
resp_dns_len INT,
|
|
|
|
req_net_len INT,
|
|
|
|
resp_net_len INT,
|
|
|
|
client_addr STRING,
|
|
|
|
client_port INT,
|
|
|
|
server_addr STRING,
|
|
|
|
server_port INT,
|
|
|
|
net_proto INT,
|
|
|
|
net_ipv INT,
|
|
|
|
net_ttl INT,
|
|
|
|
req_udp_sum INT,
|
|
|
|
id INT,
|
|
|
|
qtype INT,
|
|
|
|
qclass INT,
|
|
|
|
opcode INT,
|
|
|
|
rcode INT,
|
|
|
|
resp_aa BOOLEAN,
|
|
|
|
resp_tc BOOLEAN,
|
|
|
|
req_rd BOOLEAN,
|
|
|
|
resp_ra BOOLEAN,
|
|
|
|
req_z BOOLEAN,
|
|
|
|
resp_ad BOOLEAN,
|
|
|
|
req_cd BOOLEAN,
|
|
|
|
qname STRING,
|
|
|
|
resp_ancount INT,
|
|
|
|
resp_arcount INT,
|
|
|
|
resp_nscount INT,
|
|
|
|
req_edns_ver INT,
|
|
|
|
req_edns_udp INT,
|
|
|
|
req_edns_do BOOLEAN,
|
|
|
|
resp_edns_rcode INT,
|
|
|
|
req_edns_ping BOOLEAN,
|
|
|
|
req_edns_dau STRING,
|
|
|
|
req_edns_dhu STRING,
|
|
|
|
req_edns_n3u STRING,
|
|
|
|
resp_edns_nsid STRING,
|
|
|
|
edns_client_subnet STRING,
|
|
|
|
edns_other STRING)
|
|
|
|
partitioned by (server STRING, batchname STRING)
|
|
|
|
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' ESCAPED BY '\\';
|
|
|
|
```
|
|
|
|
|
|
|
|
## 3. Import CSV files into the table
|
|
|
|
|
|
|
|
Upload the data to HDFS:
|
|
|
|
```
|
|
|
|
hdfs dfs -mkdir -p /user/tgavenciak/andraste-import1/
|
|
|
|
hdfs dfs -put andraste/andraste-*.csv.gz /user/tgavenciak/andraste-import1/
|
|
|
|
```
|
|
|
|
|
|
|
|
Import into impala (moves the files into Impala)
|
|
|
|
```sql
|
|
|
|
LOAD DATA INPATH '/user/tgavenciak/andraste-import1/' INTO TABLE dnscol.csv_import PARTITION (server="andraste", batchname="2016-08-24-tgimport1");
|
|
|
|
```
|
|
|
|
|
|
|
|
## 4. Optional: Create Entrada-like destination table (once)
|
|
|
|
|
|
|
|
```
|
|
|
|
CREATE TABLE dnscol.queries LIKE dns.queries STORED AS parquet;
|
|
|
|
```
|
|
|
|
|
|
|
|
## 5. Copy the data from `csv_import` to `queries`
|
|
|
|
|
|
|
|
```
|
|
|
|
INSERT INTO dns.queries (id, unixtime, time, qname, domainname, len, frag, ttl, ipv, prot, src, srcp, dst, dstp, udp_sum, dns_len,
|
|
|
|
aa, tc, rd, ra, z, ad, cd, ancount, arcount, nscount, qdcount, opcode, rcode, qtype, qclass, country, asn, edns_udp, edns_version,
|
|
|
|
edns_do, edns_ping, edns_nsid, edns_dnssec_dau, edns_dnssec_dhu, edns_dnssec_n3u, edns_client_subnet, edns_other,
|
|
|
|
edns_client_subnet_asn, edns_client_subnet_country, labels, res_len, time_micro, resp_frag, proc_time, is_google, is_opendns,
|
|
|
|
dns_res_len, server_location)
|
|
|
|
PARTITION (year, month, day, server)
|
|
|
|
SELECT
|
|
|
|
-- Entrada: id INT
|
|
|
|
-- Entrada: unixtime BIGINT
|
|
|
|
-- Entrada: time BIGINT
|
|
|
|
-- Entrada: qname STRING
|
|
|
|
-- Entrada: domainname STRING
|
|
|
|
-- Entrada: len INT
|
|
|
|
-- Entrada: frag INT
|
|
|
|
-- Entrada: ttl INT
|
|
|
|
-- Entrada: ipv INT
|
|
|
|
-- Entrada: prot INT
|
|
|
|
-- Entrada: src STRING
|
|
|
|
-- Entrada: srcp INT
|
|
|
|
-- Entrada: dst STRING
|
|
|
|
-- Entrada: dstp INT
|
|
|
|
-- Entrada: udp_sum INT
|
|
|
|
-- Entrada: dns_len INT
|
|
|
|
-- Entrada: aa BOOLEAN
|
|
|
|
-- Entrada: tc BOOLEAN
|
|
|
|
-- Entrada: rd BOOLEAN
|
|
|
|
-- Entrada: ra BOOLEAN
|
|
|
|
-- Entrada: z BOOLEAN
|
|
|
|
-- Entrada: ad BOOLEAN
|
|
|
|
-- Entrada: cd BOOLEAN
|
|
|
|
-- Entrada: ancount INT
|
|
|
|
-- Entrada: arcount INT
|
|
|
|
-- Entrada: nscount INT
|
|
|
|
-- Entrada: qdcount INT
|
|
|
|
-- Entrada: opcode INT
|
|
|
|
-- Entrada: rcode INT
|
|
|
|
-- Entrada: qtype INT
|
|
|
|
-- Entrada: qclass INT
|
|
|
|
-- Entrada: country STRING
|
|
|
|
-- Entrada: asn STRING
|
|
|
|
-- Entrada: edns_udp INT
|
|
|
|
-- Entrada: edns_version SMALLINT
|
|
|
|
-- Entrada: edns_do BOOLEAN
|
|
|
|
-- Entrada: edns_ping BOOLEAN
|
|
|
|
-- Entrada: edns_nsid STRING
|
|
|
|
-- Entrada: edns_dnssec_dau STRING
|
|
|
|
-- Entrada: edns_dnssec_dhu STRING
|
|
|
|
-- Entrada: edns_dnssec_n3u STRING
|
|
|
|
-- Entrada: edns_client_subnet STRING
|
|
|
|
-- Entrada: edns_other STRING
|
|
|
|
-- Entrada: edns_client_subnet_asn STRING
|
|
|
|
-- Entrada: edns_client_subnet_country STRING
|
|
|
|
-- Entrada: labels INT
|
|
|
|
-- Entrada: res_len INT
|
|
|
|
-- Entrada: time_micro BIGINT
|
|
|
|
-- Entrada: resp_frag INT
|
|
|
|
-- Entrada: proc_time INT
|
|
|
|
-- Entrada: is_google BOOLEAN
|
|
|
|
-- Entrada: is_opendns BOOLEAN
|
|
|
|
-- Entrada: dns_res_len INT
|
|
|
|
-- Entrada: server_location STRING
|
|
|
|
-- Entrada: year INT
|
|
|
|
-- Entrada: month INT
|
|
|
|
-- Entrada: day INT
|
|
|
|
-- Entrada: server STRING
|
|
|
|
FROM dnscol.csv_import WHERE server="andraste" AND batchname="2016-08-24-tgimport1";
|
|
|
|
```
|
|
|
|
|