diff options
author | gennyble <gen@nyble.dev> | 2025-08-19 20:24:21 -0500 |
---|---|---|
committer | gennyble <gen@nyble.dev> | 2025-08-19 20:24:21 -0500 |
commit | 645e9496b8e0d6c6b7e7e6254760e816b9fc3382 (patch) | |
tree | 7f7753dc276f4e25d4ddf814dba431c2e1934b5e | |
parent | 323a50d29815fc1b15bbacac69442f23f4bee156 (diff) | |
download | corgi-645e9496b8e0d6c6b7e7e6254760e816b9fc3382.tar.gz corgi-645e9496b8e0d6c6b7e7e6254760e816b9fc3382.zip |
rework database; 1000x uptick in corgi-stats performance
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | Cargo.lock | 13 | ||||
-rw-r--r-- | corgi/Cargo.toml | 5 | ||||
-rw-r--r-- | corgi/changelog.md | 16 | ||||
-rw-r--r-- | corgi/src/main.rs | 29 | ||||
-rw-r--r-- | corgi/src/stats.rs | 134 | ||||
-rw-r--r-- | stats/Cargo.toml | 2 | ||||
-rw-r--r-- | stats/src/main.rs | 29 |
8 files changed, 198 insertions, 32 deletions
diff --git a/.gitignore b/.gitignore index cee48f1..76a8f24 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ /target -*.sqlite +*.sqlite* *.db \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index f3bd8c4..b8d94f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -82,7 +82,7 @@ checksum = "ea618ded77af626818bde0f0802da7c20d47e38e23e37be40f6f807a76079e82" [[package]] name = "corgi" -version = "1.0.1" +version = "1.1.0" dependencies = [ "base64", "confindent", @@ -92,12 +92,13 @@ dependencies = [ "regex-lite", "rusqlite", "sha2", + "time", "tokio", ] [[package]] name = "corgi-stats" -version = "0.1.0" +version = "1.1.0" dependencies = [ "rusqlite", "time", @@ -524,9 +525,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.40" +version = "0.3.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d9c75b47bdff86fa3334a3db91356b8d7d86a9b839dab7d0bdc5c3d3a077618" +checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" dependencies = [ "deranged", "itoa", @@ -545,9 +546,9 @@ checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" [[package]] name = "time-macros" -version = "0.2.21" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29aa485584182073ed57fd5004aa09c371f021325014694e432313345865fd04" +checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" dependencies = [ "num-conv", "time-core", diff --git a/corgi/Cargo.toml b/corgi/Cargo.toml index df2a8ef..b2e56ba 100644 --- a/corgi/Cargo.toml +++ b/corgi/Cargo.toml @@ -6,7 +6,7 @@ license = "ISC" repository = "https://git.nyble.dev/corgi/about" readme = "../README.md" -version = "1.0.1" +version = "1.1.0" edition = "2024" [dependencies] @@ -14,8 +14,9 @@ base64 = "0.22.1" http-body-util = "0.1.3" hyper-util = { version = "0.1.10", features = ["tokio"] } regex-lite = "0.1.6" -rusqlite = { version = "0.34.0", features = ["bundled"] } +rusqlite = { version = "0.34.0", features = ["bundled", "time"] } sha2 = "0.10.8" +time = { version = "0.3.41", features = ["formatting"] } [dependencies.confindent] version = "2.2.1" diff --git a/corgi/changelog.md b/corgi/changelog.md index 8967e3e..1011e46 100644 --- a/corgi/changelog.md +++ b/corgi/changelog.md @@ -1,3 +1,19 @@ +# v1.1.0 + +this release makes changes to the sqlite database aimed to allow applications +ingesting the data to run more efficiently. in doing this, corgi-stats received +a 1000x uplift when querying against my production table with 11 million rows. + +corgi will automatically perform the database migrations necessary and fill in +initial values for the new column/table using the existing data. If you have +a lot of rows in the database, this might take a few seconds. + +- switch sqlite db to use write-ahead logging +- add a `request_count` column to the `agents` table +- create an `ephemeral_requests` table, managed by corgi, to store the last 24 hours requests separate from the main `requests` table +- create a trigger to insert a row in the `ephemeral_requests` table when `requests` receives and insert +- create a trigger to increment `request_count` in the `agents` table when a row is inserted on `requests` + # v1.0.1 - have corgi set the stats database to WAL journal mode. \ No newline at end of file diff --git a/corgi/src/main.rs b/corgi/src/main.rs index 1772d68..1192c4c 100644 --- a/corgi/src/main.rs +++ b/corgi/src/main.rs @@ -1,5 +1,10 @@ use core::fmt; -use std::{net::SocketAddr, pin::Pin, sync::Arc}; +use std::{ + net::SocketAddr, + pin::Pin, + sync::Arc, + time::{Duration, Instant}, +}; use caller::HttpRequest; use http_body_util::{BodyExt, Full}; @@ -40,11 +45,33 @@ async fn run(settings: Settings, stats: Stats) { client_addr: addr, }; + let mut last_clean = None; + loop { + // Clean at the top so we do it once on boot, but keep out of the + // flow of the request to keep it speedy. This will delay accepting + // a new connection when the clean actually runs, but that is fine. + match last_clean { + None => { + let count = svc.stats.cleanup_ephemeral_requests(); + println!("cleaned {count} requests from the ephemeral table"); + last_clean = Some(Instant::now()); + } + Some(inst) if inst.elapsed() >= Duration::from_secs(60 * 60) => { + let count = svc.stats.cleanup_ephemeral_requests(); + println!("cleaned {count} requests from the ephemeral table"); + last_clean = Some(Instant::now()); + } + _ => (), + } + + // Now we accept the connection and spawn a handler let (stream, caddr) = listen.accept().await.unwrap(); let io = TokioIo::new(stream); + let mut svc_clone = svc.clone(); svc_clone.client_addr = caddr; + tokio::task::spawn( async move { http1::Builder::new().serve_connection(io, svc_clone).await }, ); diff --git a/corgi/src/stats.rs b/corgi/src/stats.rs index 23e85a0..c407cd9 100644 --- a/corgi/src/stats.rs +++ b/corgi/src/stats.rs @@ -3,6 +3,7 @@ use std::{net::IpAddr, path::Path, sync::Mutex}; use base64::{Engine, prelude::BASE64_STANDARD_NO_PAD}; use rusqlite::{Connection, OptionalExtension, params}; use sha2::{Digest, Sha256}; +use time::{Duration, OffsetDateTime, PrimitiveDateTime}; #[derive(Debug)] pub struct Stats { @@ -21,8 +22,30 @@ impl Stats { Self::set_wal(&conn); - conn.execute(CREATE_TABLE_AGENT, ()).unwrap(); - conn.execute(CREATE_TABLE_REQUESTS, ()).unwrap(); + // "agents" exists and trigger does not; we need to alter and prime + if Self::table_exists(&conn, "agents") && !Self::trigger_exists(&conn, "agent_count") { + println!("agents table exists, but needs request_count column. Altering and priming"); + conn.execute(MIGRATE_AGENTS_ADD_REQUEST_COUNT, ()).unwrap(); + + Self::prime_agents_request_count(&conn); + } else { + conn.execute(CREATE_TABLE_AGENT, ()).unwrap(); + conn.execute(CREATE_TABLE_REQUESTS, ()).unwrap(); + } + + conn.execute(CREATE_TRIGGER_COUNT_AGENT, ()).unwrap(); + + // Instead of just an IF NOT EXISTS here, we're checking it exists + // so we can copy an initial amount of requests from the main table + // to the ephemeral table. + if !Self::table_exists(&conn, "ephemeral_requests") { + println!("ephemeral_requests does not exist. Creating and priming"); + conn.execute(CREATE_TRIGGER_EPHEMERAL, ()).unwrap(); + conn.execute(CREATE_TABLE_EPHEMERAL_REQUESTS, ()).unwrap(); + + let count = Self::prime_ephemeral_table(&conn); + println!("Primed with {count} rows"); + } } fn set_wal(conn: &Connection) { @@ -38,6 +61,32 @@ impl Stats { } } + fn table_exists(conn: &Connection, name: &str) -> bool { + let exist: Option<String> = conn + .query_row( + "SELECT name FROM sqlite_schema WHERE type='table' AND name=?1;", + params![name], + |r| r.get(0), + ) + .optional() + .unwrap(); + + exist.is_some() + } + + fn trigger_exists(conn: &Connection, name: &str) -> bool { + let exist: Option<String> = conn + .query_row( + "SELECT name FROM sqlite_schema WHERE type='trigger' AND name=?1;", + params![name], + |r| r.get(0), + ) + .optional() + .unwrap(); + + exist.is_some() + } + pub fn log_request(&self, request: Request) { let Request { agent, @@ -83,6 +132,61 @@ impl Stats { ) .unwrap(); } + + /// Small, single line function to return the lower-bound date of ephemeral + /// requests. + fn ephemeral_lifetime() -> OffsetDateTime { + OffsetDateTime::now_utc() - Duration::days(1) + } + + pub fn cleanup_ephemeral_requests(&self) -> usize { + let lower = Self::ephemeral_lifetime(); + + let sql = "DELETE FROM ephemeral_requests WHERE timestamp < ?1;"; + + let conn = self.conn.lock().unwrap(); + + match conn.execute(sql, params![lower]) { + Err(e) => { + eprintln!("ERROR failed to run ephemeral clean: {e}"); + panic!(); + } + Ok(count) => count, + } + } + + fn prime_ephemeral_table(conn: &Connection) -> usize { + let lower = Self::ephemeral_lifetime(); + + let sql = "INSERT INTO ephemeral_requests SELECT id, timestamp FROM requests WHERE timestamp > ?1;"; + match conn.execute(sql, params![lower]) { + Err(e) => { + eprintln!("ERROR failed to prime ephemeral: {e}"); + panic!(); + } + Ok(count) => count, + } + } + + fn prime_agents_request_count(conn: &Connection) { + let sql = "SELECT agent_id, count(id) as count FROM requests GROUP BY agent_id"; + let mut prepared = conn.prepare(sql).unwrap(); + + let counts: Vec<(i64, i64)> = prepared + .query_map((), |row| Ok((row.get(0)?, row.get(1)?))) + .optional() + .unwrap() + .map(|iter| iter.map(|e| e.unwrap()).collect()) + .unwrap(); + + for (agent, count) in counts { + conn.execute( + "UPDATE agents SET request_count = ?1 WHERE id = ?2;", + params![count, agent], + ) + .unwrap(); + } + } } pub struct Request<'r> { @@ -96,9 +200,13 @@ const CREATE_TABLE_AGENT: &'static str = "\ CREATE TABLE IF NOT EXISTS agents( id INTEGER PRIMARY KEY AUTOINCREMENT, hash TEXT NOT NULL, - agent TEXT NOT NULL + agent TEXT NOT NULL, + request_count INTEGER NOT NULL DEFAULT 0 );"; +const MIGRATE_AGENTS_ADD_REQUEST_COUNT: &'static str = + "ALTER TABLE agents ADD COLUMN request_count INTEGER NOT NULL DEFAULT 0"; + const CREATE_TABLE_REQUESTS: &'static str = "\ CREATE TABLE IF NOT EXISTS requests( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -110,3 +218,23 @@ const CREATE_TABLE_REQUESTS: &'static str = "\ FOREIGN KEY (agent_id) REFERENCES agents(id) );"; + +const CREATE_TRIGGER_EPHEMERAL: &'static str = "\ + CREATE TRIGGER IF NOT EXISTS requests_copy_ephemeral AFTER INSERT ON requests + BEGIN + INSERT INTO ephemeral_requests(request_id, timestamp) VALUES(new.id, new.timestamp); + END;"; + +const CREATE_TRIGGER_COUNT_AGENT: &'static str = "\ + CREATE TRIGGER IF NOT EXISTS agent_count AFTER INSERT ON requests + BEGIN + UPDATE agents SET request_count = request_count + 1 WHERE agents.id = new.agent_id; + END;"; + +const CREATE_TABLE_EPHEMERAL_REQUESTS: &'static str = "\ + CREATE TABLE IF NOT EXISTS ephemeral_requests( + request_id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + FOREIGN KEY (request_id) + REFERENCES requests(id) + );"; diff --git a/stats/Cargo.toml b/stats/Cargo.toml index a012b94..c7e36b1 100644 --- a/stats/Cargo.toml +++ b/stats/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "corgi-stats" -version = "0.1.0" +version = "1.1.0" edition = "2024" [dependencies] diff --git a/stats/src/main.rs b/stats/src/main.rs index 7e3f922..5b83bdd 100644 --- a/stats/src/main.rs +++ b/stats/src/main.rs @@ -3,18 +3,16 @@ use std::{io::Write, time::Instant}; use rusqlite::{Connection, params}; use time::{Duration, OffsetDateTime}; -// Thank you, cat, for optimizing my query -const TOP_TEN_ALL_TIME: &str = "\ - SELECT reqs.cnt, agents.agent - FROM agents - JOIN ( - SELECT count(id) as cnt, agent_id - FROM requests - GROUP BY agent_id - ) reqs - ON reqs.agent_id=agents.id - ORDER BY reqs.cnt DESC LIMIT 10; -"; +const TOP_TEN_ALL_TIME: &str = + "SELECT request_count, agent FROM agents ORDER BY request_count DESC LIMIT 10;"; + +const LAST_FIFTEEN_MINUTES: &str = "\ + SELECT count(ephemeral_requests.request_id) as request_count, agents.agent + FROM ephemeral_requests + INNER JOIN requests ON ephemeral_requests.request_id = requests.id + INNER JOIN agents ON requests.agent_id = agents.id + WHERE ephemeral_requests.timestamp > ?1 + GROUP BY requests.agent_id;"; const STYLE: &'static str = include_str!("style.css"); const FAVICON: &'static [u8] = include_bytes!("favicon.gif"); @@ -48,13 +46,8 @@ fn main() { let now = OffsetDateTime::now_utc(); let fifteen_ago = now - Duration::minutes(15); - let query = "SELECT count(requests.id) AS request_count, agents.agent FROM requests \ - INNER JOIN agents ON requests.agent_id = agents.id \ - WHERE requests.timestamp > ?1 \ - GROUP BY requests.agent_id;"; - let start = Instant::now(); - let mut prepared = db.prepare(query).unwrap(); + let mut prepared = db.prepare(LAST_FIFTEEN_MINUTES).unwrap(); let mut agents: Vec<(usize, String)> = prepared .query_map(params![fifteen_ago], |row| Ok((row.get(0)?, row.get(1)?))) .unwrap() |