From c0dd5b1cc23ae54a321b7d2946fe4a513c262d69 Mon Sep 17 00:00:00 2001 From: Reiner Herrmann Date: Fri, 9 Feb 2024 23:16:06 -0500 Subject: [PATCH] feat: URL preview support from upstream MR https://gitlab.com/famedly/conduit/-/merge_requests/347 with the following changes (so far): - remove hardcoded list of allowed hosts (strongly disagree with this, even if it is desired, it should not be harcoded) - add more allow config options for granularity via URL contains, host contains, and domain is (explicit match) for security - warn if a user is allowing all URLs to be previewed for security reasons - replace an expect with proper error handling - bump webpage to 2.0 - improved code style a tad Co-authored-by: rooot Signed-off-by: rooot Signed-off-by: strawberry --- Cargo.lock | 242 +++++++++++++++++++++--- Cargo.toml | 4 +- conduwuit-example.toml | 17 ++ debian/postinst | 34 +++- src/api/client_server/media.rs | 313 +++++++++++++++++++++++++++++++- src/config/mod.rs | 21 +++ src/database/key_value/media.rs | 114 +++++++++++- src/database/mod.rs | 2 + src/main.rs | 25 +++ src/service/globals/mod.rs | 12 ++ src/service/media/data.rs | 11 ++ src/service/media/mod.rs | 60 +++++- src/service/mod.rs | 7 +- 13 files changed, 821 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 991c7e1b..ef96dde8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -102,7 +102,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -234,7 +234,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn", + "syn 2.0.48", ] [[package]] @@ -340,9 +340,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.4.18" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" +checksum = "80c21025abd42669a92efc996ef13cfb2c5c627858421ea58d5c3b331a6c134f" dependencies = 
[ "clap_builder", "clap_derive", @@ -350,9 +350,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.4.18" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" +checksum = "458bf1f341769dfcf849846f65dffdf9146daa56bcd2a47cb4e1de9915567c99" dependencies = [ "anstyle", "clap_lex", @@ -360,21 +360,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.4.7" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] name = "clap_lex" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "color_quant" @@ -440,6 +440,7 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "trust-dns-resolver", + "webpage", ] [[package]] @@ -538,7 +539,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -647,7 +648,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -722,6 +723,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures-channel" version = "0.3.30" @@ -762,7 +773,7 @@ checksum = 
"87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -937,6 +948,20 @@ dependencies = [ "winapi", ] +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "http" version = "0.2.11" @@ -1335,12 +1360,44 @@ dependencies = [ "libc", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "maplit" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -1401,6 +1458,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + [[package]] name = "nix" version = "0.27.1" @@ -1692,7 +1755,7 @@ dependencies = [ "proc-macro2", 
"proc-macro2-diagnostics", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -1717,6 +1780,44 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.4" @@ -1734,7 +1835,7 @@ checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -1796,6 +1897,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "prettyplease" version = "0.2.16" @@ -1803,7 +1910,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.48", ] [[package]] @@ -1833,7 +1940,7 @@ checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", "version_check", "yansi", ] @@ -2165,7 +2272,7 @@ dependencies = [ "quote", "ruma-identifiers-validation", "serde", - "syn", + "syn 2.0.48", "toml", ] @@ -2378,7 +2485,7 @@ checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -2543,6 +2650,12 @@ dependencies = [ "time", ] +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "slab" version = "0.4.9" @@ -2584,6 +2697,32 @@ dependencies = [ "der", ] +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "subslice" version = "0.2.3" @@ -2599,6 +2738,17 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.48" @@ -2637,6 +2787,17 @@ dependencies = [ "libc", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "thiserror" version = "1.0.56" @@ -2654,7 +2815,7 @@ checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -2790,7 +2951,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -2944,7 +3105,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] @@ -3147,6 +3308,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "uuid" version = "1.7.0" @@ -3210,7 +3377,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.48", "wasm-bindgen-shared", ] @@ -3244,7 +3411,7 @@ checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3275,6 +3442,18 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpage" +version = "2.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb86b12e58d490a99867f561ce8466ffa7b73e24d015a8e7f5bc111d4424ba2" +dependencies = [ + "html5ever", + "markup5ever_rcdom", + "serde_json", + "url", +] + [[package]] name = "weezl" version = "0.1.8" @@ -3466,6 +3645,17 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "xml5ever" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" +dependencies = [ + "log", + "mac", + "markup5ever", +] + [[package]] name = "yansi" version = "1.0.0-rc.1" @@ -3489,7 +3679,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.48", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 3b79ad69..1c800730 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,7 +91,7 @@ hmac = "0.12.1" sha-1 = "0.10.1" sha2 = { version = "0.10.8" } # used for conduit's CLI and admin room command parsing -clap = { version = "4.4.17", default-features = false, features = ["std", "derive", "help", "usage", "error-context"] } +clap = { version = "4.5.0", default-features = false, features = ["std", "derive", "help", "usage", "error-context"] } futures-util = { version = "0.3.30", default-features = false } # Used for reading the configuration from conduit.toml & environment variables figment = { version = "0.10.14", features = ["env", "toml"] } @@ -106,6 +106,8 @@ ipaddress = "0.1.3" sd-notify = { version = "0.4.1", optional = true } +webpage = { version = "2.0", default-features = false } + [target.'cfg(unix)'.dependencies] nix = { version = "0.27.1", features = ["resource"] } diff --git a/conduwuit-example.toml b/conduwuit-example.toml index a8a01609..2f6d5f5c 100644 --- a/conduwuit-example.toml +++ b/conduwuit-example.toml @@ -113,6 +113,8 @@ ip_range_denylist = [ "fec0::/10", ] + + ### Moderation / Privacy / Security # Set to 
true to allow user type "guest" registrations. Element attempts to register guest users automatically. @@ -163,6 +165,21 @@ allow_public_room_directory_without_auth = false # If federation is disabled entirely (`allow_federation`), this is inherently false. For privacy, this is best disabled. allow_device_name_federation = false +# Vector list of domains allowed to send requests to for URL previews. Defaults to none. +# Note: this is a *contains* match, not an explicit match. Putting "google.com" will match "https://google.com" and "http://mymaliciousdomainexamplegoogle.com" +# Setting this to "*" will allow all URL previews. Please note that this opens up significant attack surface to your server, you are expected to be aware of the risks by doing so. +url_preview_domain_contains_allowlist = [] + +# Vector list of explicit domains allowed to send requests to for URL previews. Defaults to none. +# Note: This is an *explicit* match, not a contains match. Putting "google.com" will match "https://google.com", "http://google.com", but not "https://mymaliciousdomainexamplegoogle.com" +# Setting this to "*" will allow all URL previews. Please note that this opens up significant attack surface to your server, you are expected to be aware of the risks by doing so. +url_preview_domain_explicit_allowlist = [] + +# Vector list of URLs allowed to send requests to for URL previews. Defaults to none. +# Note that this is a *contains* match, not an explicit match. Putting "https://google.com" will match "https://google.com/" and "https://google.com/url?q=https://mymaliciousdomainexample.com" +# Setting this to "*" will allow all URL previews. Please note that this opens up significant attack surface to your server, you are expected to be aware of the risks by doing so. 
+url_preview_url_contains_allowlist = [] + ### Misc diff --git a/debian/postinst b/debian/postinst index 630e4432..7f588689 100644 --- a/debian/postinst +++ b/debian/postinst @@ -92,12 +92,20 @@ port = ${CONDUIT_PORT} # likely need this to be 0.0.0.0. address = "${CONDUIT_ADDRESS}" -# How many requests conduwuit sends to other servers at the same time. Default is 100 -# Note that because conduwuit is very fast unlike other homeserver implementations, -# setting this too high could inadvertently result in ratelimits kicking in, or -# overloading lower-end homeservers out there. Recommended to leave this alone unless you -# have a valid reason to. No this will not speed up room joins. -#max_concurrent_requests = 100 +# How many requests conduwuit sends to other servers at the same time concurrently. Default is 500 +# Note that because conduwuit is very fast unlike other homeserver implementations, setting this too +# high could inadvertently result in ratelimits kicking in, or overloading lower-end homeservers out there. +# +# A valid use-case for enabling this is if you have a significant amount of overall federation activity +# such as many rooms joined/tracked, and many servers in the true destination cache caused by that. Upon +# rebooting conduwuit, depending on how fast your resources are, client and incoming federation requests +# may timeout or be "stalled" for a period of time due to hitting the max concurrent requests limit from +# refreshing federation/destination caches and such. +# +# If you have a lot of active users on your homeserver, you will definitely need to raise this. +# +# No this will not speed up room joins. +#max_concurrent_requests = 500 # Max request size for file uploads max_request_size = 20_000_000 # in bytes @@ -142,6 +150,8 @@ ip_range_denylist = [ "fec0::/10", ] + + ### Moderation / Privacy / Security # Set to true to allow user type "guest" registrations. Element attempts to register guest users automatically. 
@@ -192,6 +202,18 @@ allow_public_room_directory_without_auth = false # If federation is disabled entirely (`allow_federation`), this is inherently false. For privacy, this is best disabled. allow_device_name_federation = false +# Vector list of domains allowed to send requests to for URL previews. Defaults to none. +# Note: this is a *contains* match, not an explicit match. Putting "google.com" will match "https://google.com" and "http://mymaliciousdomainexamplegoogle.com" +url_preview_domain_contains_allowlist = [] + +# Vector list of explicit domains allowed to send requests to for URL previews. Defaults to none. +# Note: This is an *explicit* match, not a contains match. Putting "google.com" will match "https://google.com", "http://google.com", but not "https://mymaliciousdomainexamplegoogle.com" +url_preview_domain_explicit_allowlist = [] + +# Vector list of URLs allowed to send requests to for URL previews. Defaults to none. +# Note that this is a *contains* match, not an explicit match. 
Putting "https://google.com" will match "https://google.com/" and "https://google.com/url?q=https://mymaliciousdomainexample.com" +url_preview_url_contains_allowlist = [] + ### Misc diff --git a/src/api/client_server/media.rs b/src/api/client_server/media.rs index e36d700d..7c5375c3 100644 --- a/src/api/client_server/media.rs +++ b/src/api/client_server/media.rs @@ -1,14 +1,21 @@ -use std::time::Duration; +use std::{io::Cursor, net::IpAddr, sync::Arc, time::Duration}; -use crate::{service::media::FileMeta, services, utils, Error, Result, Ruma}; +use crate::{ + service::media::{FileMeta, UrlPreviewData}, + services, utils, Error, Result, Ruma, +}; +use image::io::Reader as ImgReader; + +use reqwest::Url; use ruma::api::client::{ error::ErrorKind, media::{ create_content, get_content, get_content_as_filename, get_content_thumbnail, - get_media_config, + get_media_config, get_media_preview, }, }; -use tracing::info; +use tracing::{debug, error, info}; +use webpage::HTML; /// generated MXC ID (`media-id`) length const MXC_LENGTH: usize = 32; @@ -24,6 +31,43 @@ pub async fn get_media_config_route( }) } +/// # `GET /_matrix/media/v3/preview_url` +/// +/// Returns URL preview. 
+pub async fn get_media_preview_route( + body: Ruma, +) -> Result { + let url = &body.url; + if !url_preview_allowed(url) { + return Err(Error::BadRequest( + ErrorKind::Forbidden, + "URL is not allowed to be previewed", + )); + } + + if let Ok(preview) = get_url_preview(url).await { + let res = serde_json::value::to_raw_value(&preview).map_err(|e| { + error!( + "Failed to convert UrlPreviewData into a serde json value: {}", + e + ); + Error::BadRequest( + ErrorKind::Unknown, + "Unknown error occurred parsing URL preview", + ) + })?; + + return Ok(get_media_preview::v3::Response::from_raw_value(res)); + } + + Err(Error::BadRequest( + ErrorKind::LimitExceeded { + retry_after_ms: Some(Duration::from_secs(5)), + }, + "Retry later", + )) +} + /// # `POST /_matrix/media/v3/upload` /// /// Permanently save media in the server. @@ -266,3 +310,264 @@ pub async fn get_content_thumbnail_route( Err(Error::BadRequest(ErrorKind::NotFound, "Media not found.")) } } + +async fn download_image(client: &reqwest::Client, url: &str) -> Result { + let image = client.get(url).send().await?.bytes().await?; + let mxc = format!( + "mxc://{}/{}", + services().globals.server_name(), + utils::random_string(MXC_LENGTH) + ); + + services() + .media + .create(mxc.clone(), None, None, &image) + .await?; + + let (width, height) = match ImgReader::new(Cursor::new(&image)).with_guessed_format() { + Err(_) => (None, None), + Ok(reader) => match reader.into_dimensions() { + Err(_) => (None, None), + Ok((width, height)) => (Some(width), Some(height)), + }, + }; + + Ok(UrlPreviewData { + image: Some(mxc), + image_size: Some(image.len()), + image_width: width, + image_height: height, + ..Default::default() + }) +} + +async fn download_html(client: &reqwest::Client, url: &str) -> Result { + let max_download_size = 300_000; // TODO: is this bytes? kilobytes? megabytes? + + let mut response = client.get(url).send().await?; + + let mut bytes: Vec = Vec::new(); + while let Some(chunk) = response.chunk().await? 
{ + bytes.extend_from_slice(&chunk); + if bytes.len() > max_download_size { + break; + } + } + let body = String::from_utf8_lossy(&bytes); + let html = match HTML::from_string(body.to_string(), Some(url.to_owned())) { + Ok(html) => html, + Err(_) => { + return Err(Error::BadRequest( + ErrorKind::Unknown, + "Failed to parse HTML", + )) + } + }; + + let mut data = match html.opengraph.images.first() { + None => UrlPreviewData::default(), + Some(obj) => download_image(client, &obj.url).await?, + }; + + let props = html.opengraph.properties; + + /* use OpenGraph title/description, but fall back to HTML if not available */ + data.title = props.get("title").cloned().or(html.title); + data.description = props.get("description").cloned().or(html.description); + + Ok(data) +} + +fn url_request_allowed(addr: &IpAddr) -> bool { + // TODO: make this check ip_range_denylist + + // could be implemented with reqwest when it supports IP filtering: + // https://github.com/seanmonstar/reqwest/issues/1515 + + // These checks have been taken from the Rust core/net/ipaddr.rs crate, + // IpAddr::V4.is_global() and IpAddr::V6.is_global(), as .is_global is not + // yet stabilized. TODO: Once this is stable, this match can be simplified. 
+ match addr { + IpAddr::V4(ip4) => { + !(ip4.octets()[0] == 0 // "This network" + || ip4.is_private() + || (ip4.octets()[0] == 100 && (ip4.octets()[1] & 0b1100_0000 == 0b0100_0000)) // is_shared() + || ip4.is_loopback() + || ip4.is_link_local() + // addresses reserved for future protocols (`192.0.0.0/24`) + || (ip4.octets()[0] == 192 && ip4.octets()[1] == 0 && ip4.octets()[2] == 0) + || ip4.is_documentation() + || (ip4.octets()[0] == 198 && (ip4.octets()[1] & 0xfe) == 18) // is_benchmarking() + || (ip4.octets()[0] & 240 == 240 && !ip4.is_broadcast()) // is_reserved() + || ip4.is_broadcast()) + } + IpAddr::V6(ip6) => { + !(ip6.is_unspecified() + || ip6.is_loopback() + // IPv4-mapped Address (`::ffff:0:0/96`) + || matches!(ip6.segments(), [0, 0, 0, 0, 0, 0xffff, _, _]) + // IPv4-IPv6 Translat. (`64:ff9b:1::/48`) + || matches!(ip6.segments(), [0x64, 0xff9b, 1, _, _, _, _, _]) + // Discard-Only Address Block (`100::/64`) + || matches!(ip6.segments(), [0x100, 0, 0, 0, _, _, _, _]) + // IETF Protocol Assignments (`2001::/23`) + || (matches!(ip6.segments(), [0x2001, b, _, _, _, _, _, _] if b < 0x200) + && !( + // Port Control Protocol Anycast (`2001:1::1`) + u128::from_be_bytes(ip6.octets()) == 0x2001_0001_0000_0000_0000_0000_0000_0001 + // Traversal Using Relays around NAT Anycast (`2001:1::2`) + || u128::from_be_bytes(ip6.octets()) == 0x2001_0001_0000_0000_0000_0000_0000_0002 + // AMT (`2001:3::/32`) + || matches!(ip6.segments(), [0x2001, 3, _, _, _, _, _, _]) + // AS112-v6 (`2001:4:112::/48`) + || matches!(ip6.segments(), [0x2001, 4, 0x112, _, _, _, _, _]) + // ORCHIDv2 (`2001:20::/28`) + || matches!(ip6.segments(), [0x2001, b, _, _, _, _, _, _] if (0x20..=0x2F).contains(&b)) + )) + || ((ip6.segments()[0] == 0x2001) && (ip6.segments()[1] == 0xdb8)) // is_documentation() + || ((ip6.segments()[0] & 0xfe00) == 0xfc00) // is_unique_local() + || ((ip6.segments()[0] & 0xffc0) == 0xfe80)) // is_unicast_link_local + } + } +} + +async fn request_url_preview(url: &str) -> 
Result { + let client = services().globals.default_client(); + let response = client.head(url).send().await?; + + if !response + .remote_addr() + .map_or(false, |a| url_request_allowed(&a.ip())) + { + return Err(Error::BadRequest( + ErrorKind::Forbidden, + "Requesting from this address is forbidden", + )); + } + + let content_type = match response + .headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|x| x.to_str().ok()) + { + Some(ct) => ct, + None => { + return Err(Error::BadRequest( + ErrorKind::Unknown, + "Unknown Content-Type", + )) + } + }; + let data = match content_type { + html if html.starts_with("text/html") => download_html(&client, url).await?, + img if img.starts_with("image/") => download_image(&client, url).await?, + _ => { + return Err(Error::BadRequest( + ErrorKind::Unknown, + "Unsupported Content-Type", + )) + } + }; + + services().media.set_url_preview(url, &data).await?; + + Ok(data) +} + +async fn get_url_preview(url: &str) -> Result { + if let Some(preview) = services().media.get_url_preview(url).await { + return Ok(preview); + } + + // ensure that only one request is made per URL + let mutex_request = Arc::clone( + services() + .media + .url_preview_mutex + .write() + .unwrap() + .entry(url.to_owned()) + .or_default(), + ); + let _request_lock = mutex_request.lock().await; + + match services().media.get_url_preview(url).await { + Some(preview) => Ok(preview), + None => request_url_preview(url).await, + } +} + +fn url_preview_allowed(url_str: &str) -> bool { + let url: Url = match Url::parse(url_str) { + Ok(u) => u, + Err(_) => return false, + }; + + if ["http", "https"] + .iter() + .all(|&scheme| scheme != url.scheme().to_lowercase()) + { + debug!("Ignoring non-HTTP/HTTPS URL to preview: {}", url); + return false; + } + + let host = match url.host_str() { + None => { + debug!( + "Ignoring URL preview for a URL that does not have a host (?): {}", + url + ); + return false; + } + Some(h) => h.to_owned(), + }; + + let 
allowlist_domain_contains = services().globals.url_preview_domain_contains_allowlist(); + let allowlist_domain_explicit = services().globals.url_preview_domain_explicit_allowlist(); + let allowlist_url_contains = services().globals.url_preview_url_contains_allowlist(); + + if allowlist_domain_contains.contains(&"*".to_owned()) + || allowlist_domain_explicit.contains(&"*".to_owned()) + || allowlist_url_contains.contains(&"*".to_owned()) + { + debug!( + "Config key contains * which is allowing all URL previews. Allowing URL {}", + url + ); + return true; + } + + if !host.is_empty() { + if allowlist_domain_explicit.contains(&host) { + return true; + } + debug!( + "Host {} is allowed by url_preview_domain_explicit_allowlist (check 1/3)", + &host + ); + + if allowlist_domain_contains + .iter() + .any(|domain_s| domain_s.contains(&host.clone())) + { + return true; + } + debug!( + "Host {} is allowed by url_preview_domain_contains_allowlist (check 2/3)", + &host + ); + + if allowlist_url_contains + .iter() + .any(|url_s| url_s.contains(&url.to_string())) + { + return true; + } + debug!( + "URL {} is allowed by url_preview_url_contains_allowlist (check 3/3)", + &host + ); + } + + false +} diff --git a/src/config/mod.rs b/src/config/mod.rs index 184d1635..ddb1c654 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -134,6 +134,15 @@ pub struct Config { #[serde(default = "default_ip_range_denylist")] pub ip_range_denylist: Vec, + #[serde(default = "Vec::new")] + pub url_preview_domain_contains_allowlist: Vec, + + #[serde(default = "Vec::new")] + pub url_preview_domain_explicit_allowlist: Vec, + + #[serde(default = "Vec::new")] + pub url_preview_url_contains_allowlist: Vec, + #[serde(default = "RegexSet::empty")] #[serde(with = "serde_regex")] pub forbidden_room_names: RegexSet, @@ -349,6 +358,18 @@ impl fmt::Display for Config { ("Forbidden room names", { &self.forbidden_room_names.patterns().iter().join(", ") }), + ( + "URL preview domain contains allowlist", + 
&self.url_preview_domain_contains_allowlist.join(", "), + ), + ( + "URL preview domain explicit allowlist", + &self.url_preview_domain_explicit_allowlist.join(", "), + ), + ( + "URL preview URL contains allowlist", + &self.url_preview_url_contains_allowlist.join(", "), + ), ]; let mut msg: String = "Active config values:\n\n".to_owned(); diff --git a/src/database/key_value/media.rs b/src/database/key_value/media.rs index 6abe5ba5..712da165 100644 --- a/src/database/key_value/media.rs +++ b/src/database/key_value/media.rs @@ -1,6 +1,10 @@ use ruma::api::client::error::ErrorKind; -use crate::{database::KeyValueDatabase, service, utils, Error, Result}; +use crate::{ + database::KeyValueDatabase, + service::{self, media::UrlPreviewData}, + utils, Error, Result, +}; impl service::media::Data for KeyValueDatabase { fn create_file_metadata( @@ -79,4 +83,112 @@ impl service::media::Data for KeyValueDatabase { }; Ok((content_disposition, content_type, key)) } + + fn remove_url_preview(&self, url: &str) -> Result<()> { + self.url_previews.remove(url.as_bytes()) + } + + fn set_url_preview( + &self, + url: &str, + data: &UrlPreviewData, + timestamp: std::time::Duration, + ) -> Result<()> { + let mut value = Vec::::new(); + value.extend_from_slice(×tamp.as_secs().to_be_bytes()); + value.push(0xff); + value.extend_from_slice( + data.title + .as_ref() + .map(|t| t.as_bytes()) + .unwrap_or_default(), + ); + value.push(0xff); + value.extend_from_slice( + data.description + .as_ref() + .map(|d| d.as_bytes()) + .unwrap_or_default(), + ); + value.push(0xff); + value.extend_from_slice( + data.image + .as_ref() + .map(|i| i.as_bytes()) + .unwrap_or_default(), + ); + value.push(0xff); + value.extend_from_slice(&data.image_size.unwrap_or(0).to_be_bytes()); + value.push(0xff); + value.extend_from_slice(&data.image_width.unwrap_or(0).to_be_bytes()); + value.push(0xff); + value.extend_from_slice(&data.image_height.unwrap_or(0).to_be_bytes()); + + self.url_previews.insert(url.as_bytes(), 
&value) + } + + fn get_url_preview(&self, url: &str) -> Option { + let values = self.url_previews.get(url.as_bytes()).ok()??; + + let mut values = values.split(|&b| b == 0xff); + + let _ts = match values + .next() + .map(|b| u64::from_be_bytes(b.try_into().expect("valid BE array"))) + { + Some(0) => None, + x => x, + }; + let title = match values + .next() + .and_then(|b| String::from_utf8(b.to_vec()).ok()) + { + Some(s) if s.is_empty() => None, + x => x, + }; + let description = match values + .next() + .and_then(|b| String::from_utf8(b.to_vec()).ok()) + { + Some(s) if s.is_empty() => None, + x => x, + }; + let image = match values + .next() + .and_then(|b| String::from_utf8(b.to_vec()).ok()) + { + Some(s) if s.is_empty() => None, + x => x, + }; + let image_size = match values + .next() + .map(|b| usize::from_be_bytes(b.try_into().expect("valid BE array"))) + { + Some(0) => None, + x => x, + }; + let image_width = match values + .next() + .map(|b| u32::from_be_bytes(b.try_into().expect("valid BE array"))) + { + Some(0) => None, + x => x, + }; + let image_height = match values + .next() + .map(|b| u32::from_be_bytes(b.try_into().expect("valid BE array"))) + { + Some(0) => None, + x => x, + }; + + Some(UrlPreviewData { + title, + description, + image, + image_size, + image_width, + image_height, + }) + } } diff --git a/src/database/mod.rs b/src/database/mod.rs index 818682da..d6171208 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -147,6 +147,7 @@ pub struct KeyValueDatabase { //pub media: media::Media, pub(super) mediaid_file: Arc, // MediaId = MXC + WidthHeight + ContentDisposition + ContentType + pub(super) url_previews: Arc, //pub key_backups: key_backups::KeyBackups, pub(super) backupid_algorithm: Arc, // BackupId = UserId + Version(Count) pub(super) backupid_etag: Arc, // BackupId = UserId + Version(Count) @@ -350,6 +351,7 @@ impl KeyValueDatabase { roomuserdataid_accountdata: builder.open_tree("roomuserdataid_accountdata")?, 
roomusertype_roomuserdataid: builder.open_tree("roomusertype_roomuserdataid")?, mediaid_file: builder.open_tree("mediaid_file")?, + url_previews: builder.open_tree("url_previews")?, backupid_algorithm: builder.open_tree("backupid_algorithm")?, backupid_etag: builder.open_tree("backupid_etag")?, backupkeyid_backup: builder.open_tree("backupkeyid_backup")?, diff --git a/src/main.rs b/src/main.rs index 314e978c..2a949ec8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -148,8 +148,11 @@ async fn main() { error!(?error, "The database couldn't be loaded or created"); return; }; + let config = &services().globals.config; + /* ad-hoc config validation/checks */ + // check if user specified valid IP CIDR ranges on startup for cidr in services().globals.ip_range_denylist() { let _ = ipaddress::IPAddress::parse(cidr) @@ -179,6 +182,27 @@ async fn main() { warn!("! Outgoing federated presence is not spec compliant due to relying on PDUs and EDUs combined.\nOutgoing presence will not be very reliable due to this and any issues with federated outgoing presence are very likely attributed to this issue.\nIncoming presence and local presence are unaffected."); } + if config + .url_preview_domain_contains_allowlist + .contains(&"*".to_owned()) + { + warn!("All URLs are allowed for URL previews via setting \"url_preview_domain_contains_allowlist\" to \"*\". This opens up significant attack surface to your server. You are expected to be aware of the risks by doing this."); + } + if config + .url_preview_domain_explicit_allowlist + .contains(&"*".to_owned()) + { + warn!("All URLs are allowed for URL previews via setting \"url_preview_domain_explicit_allowlist\" to \"*\". This opens up significant attack surface to your server. You are expected to be aware of the risks by doing this."); + } + if config + .url_preview_url_contains_allowlist + .contains(&"*".to_owned()) + { + warn!("All URLs are allowed for URL previews via setting \"url_preview_url_contains_allowlist\" to \"*\". 
This opens up significant attack surface to your server. You are expected to be aware of the risks by doing this."); + } + + /* end ad-hoc config validation/checks */ + info!("Starting server"); if let Err(e) = run_server().await { error!("Critical error running server: {}", e); @@ -464,6 +488,7 @@ fn routes() -> Router { .ruma_route(client_server::turn_server_route) .ruma_route(client_server::send_event_to_device_route) .ruma_route(client_server::get_media_config_route) + .ruma_route(client_server::get_media_preview_route) .ruma_route(client_server::create_content_route) .ruma_route(client_server::get_content_route) .ruma_route(client_server::get_content_as_filename_route) diff --git a/src/service/globals/mod.rs b/src/service/globals/mod.rs index 434a8c4c..e1173f6f 100644 --- a/src/service/globals/mod.rs +++ b/src/service/globals/mod.rs @@ -390,6 +390,18 @@ impl Service<'_> { &self.config.emergency_password } + pub fn url_preview_domain_contains_allowlist(&self) -> &Vec<String> { + &self.config.url_preview_domain_contains_allowlist + } + + pub fn url_preview_domain_explicit_allowlist(&self) -> &Vec<String> { + &self.config.url_preview_domain_explicit_allowlist + } + + pub fn url_preview_url_contains_allowlist(&self) -> &Vec<String> { + &self.config.url_preview_url_contains_allowlist + } + pub fn forbidden_room_names(&self) -> &RegexSet { &self.config.forbidden_room_names } diff --git a/src/service/media/data.rs b/src/service/media/data.rs index 75a682cb..0cb7c097 100644 --- a/src/service/media/data.rs +++ b/src/service/media/data.rs @@ -17,4 +17,15 @@ pub trait Data: Send + Sync { width: u32, height: u32, ) -> Result<(Option<String>, Option<String>, Vec<u8>)>; + + fn remove_url_preview(&self, url: &str) -> Result<()>; + + fn set_url_preview( + &self, + url: &str, + data: &super::UrlPreviewData, + timestamp: std::time::Duration, + ) -> Result<()>; + + fn get_url_preview(&self, url: &str) -> Option<super::UrlPreviewData>; } diff --git a/src/service/media/mod.rs b/src/service/media/mod.rs index 4a016bda..5cbb07c2 100644 --- 
a/src/service/media/mod.rs +++ b/src/service/media/mod.rs @@ -1,7 +1,13 @@ mod data; -use std::io::Cursor; +use std::{ + collections::HashMap, + io::Cursor, + sync::{Arc, RwLock}, + time::SystemTime, +}; pub(crate) use data::Data; +use serde::Serialize; use crate::{services, Result}; use image::imageops::FilterType; @@ -9,6 +15,7 @@ use image::imageops::FilterType; use tokio::{ fs::File, io::{AsyncReadExt, AsyncWriteExt, BufReader}, + sync::Mutex, }; pub struct FileMeta { @@ -17,8 +24,43 @@ pub struct FileMeta { pub file: Vec<u8>, } +#[derive(Serialize, Default)] +pub struct UrlPreviewData { + #[serde( + skip_serializing_if = "Option::is_none", + rename(serialize = "og:title") + )] + pub title: Option<String>, + #[serde( + skip_serializing_if = "Option::is_none", + rename(serialize = "og:description") + )] + pub description: Option<String>, + #[serde( + skip_serializing_if = "Option::is_none", + rename(serialize = "og:image") + )] + pub image: Option<String>, + #[serde( + skip_serializing_if = "Option::is_none", + rename(serialize = "matrix:image:size") + )] + pub image_size: Option<usize>, + #[serde( + skip_serializing_if = "Option::is_none", + rename(serialize = "og:image:width") + )] + pub image_width: Option<u32>, + #[serde( + skip_serializing_if = "Option::is_none", + rename(serialize = "og:image:height") + )] + pub image_height: Option<u32>, +} + pub struct Service { pub db: &'static dyn Data, + pub url_preview_mutex: RwLock<HashMap<String, Arc<Mutex<()>>>>, } impl Service { @@ -260,6 +302,22 @@ impl Service { Ok(None) } } + + pub async fn get_url_preview(&self, url: &str) -> Option<UrlPreviewData> { + self.db.get_url_preview(url) + } + + pub async fn remove_url_preview(&self, url: &str) -> Result<()> { + // TODO: also remove the downloaded image + self.db.remove_url_preview(url) + } + + pub async fn set_url_preview(&self, url: &str, data: &UrlPreviewData) -> Result<()> { + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .expect("valid system time"); + self.db.set_url_preview(url, data, now) + } } #[cfg(test)] diff --git 
a/src/service/mod.rs b/src/service/mod.rs index 0597c211..1902fa8c 100644 --- a/src/service/mod.rs +++ b/src/service/mod.rs @@ -1,6 +1,6 @@ use std::{ collections::{BTreeMap, HashMap}, - sync::{Arc, Mutex}, + sync::{Arc, Mutex, RwLock}, }; use lru_cache::LruCache; @@ -114,7 +114,10 @@ impl Services<'_> { account_data: account_data::Service { db }, admin: admin::Service::build(), key_backups: key_backups::Service { db }, - media: media::Service { db }, + media: media::Service { + db, + url_preview_mutex: RwLock::new(HashMap::new()), + }, sending: sending::Service::build(db, &config), globals: globals::Service::load(db, config)?,