replace hardcoded 300 KB limit on spider size with a config option defaulting to 1 MB

Modern websites are sadly massive, and 300 KB is pretty low. 1 MB should be enough.

Signed-off-by: strawberry <strawberry@puppygock.gay>
This commit is contained in:
strawberry 2024-02-10 13:29:12 -05:00 committed by June
parent 2ea895199a
commit 48e4b71dd1
4 changed files with 19 additions and 5 deletions

View File

@ -180,6 +180,9 @@ url_preview_domain_explicit_allowlist = []
# Setting this to "*" will allow all URL previews. Please note that this opens up a significant attack surface on your server; you are expected to be aware of the risks of doing so.
url_preview_url_contains_allowlist = []
# Maximum number of bytes of a response body to read when spidering a URL for a preview. Defaults to 1MB (1_000_000 bytes)
url_preview_max_spider_size = 1_000_000
### Misc

View File

@ -342,14 +342,13 @@ async fn download_image(client: &reqwest::Client, url: &str) -> Result<UrlPrevie
}
async fn download_html(client: &reqwest::Client, url: &str) -> Result<UrlPreviewData> {
let max_download_size = 300_000; // TODO: is this bytes? kilobytes? megabytes?
let mut response = client.get(url).send().await?;
let mut bytes: Vec<u8> = Vec::new();
while let Some(chunk) = response.chunk().await? {
bytes.extend_from_slice(&chunk);
if bytes.len() > max_download_size {
if bytes.len() > services().globals.url_preview_max_spider_size() {
debug!("Response body from URL {} exceeds url_preview_max_spider_size ({}), not processing the rest of the response body and assuming our necessary data is in this range.", url, services().globals.url_preview_max_spider_size());
break;
}
}

View File

@ -136,12 +136,12 @@ pub struct Config {
#[serde(default = "Vec::new")]
pub url_preview_domain_contains_allowlist: Vec<String>,
#[serde(default = "Vec::new")]
pub url_preview_domain_explicit_allowlist: Vec<String>,
#[serde(default = "Vec::new")]
pub url_preview_url_contains_allowlist: Vec<String>,
#[serde(default = "default_url_preview_max_spider_size")]
pub url_preview_max_spider_size: usize,
#[serde(default = "RegexSet::empty")]
#[serde(with = "serde_regex")]
@ -370,6 +370,10 @@ impl fmt::Display for Config {
"URL preview URL contains allowlist",
&self.url_preview_url_contains_allowlist.join(", "),
),
(
"URL preview maximum spider size",
&self.url_preview_max_spider_size.to_string(),
),
];
let mut msg: String = "Active config values:\n\n".to_owned();
@ -495,3 +499,7 @@ fn default_ip_range_denylist() -> Vec<String> {
"fec0::/10".to_owned(),
]
}
/// Serde fallback for `Config::url_preview_max_spider_size`, used when the
/// option is absent from the configuration.
///
/// The value is expressed in bytes: 1_000_000 bytes, i.e. 1MB.
fn default_url_preview_max_spider_size() -> usize {
    const ONE_MEGABYTE_IN_BYTES: usize = 1_000_000;

    ONE_MEGABYTE_IN_BYTES
}

View File

@ -412,6 +412,10 @@ impl Service<'_> {
&self.config.url_preview_url_contains_allowlist
}
/// Returns the configured `url_preview_max_spider_size` value by copy.
///
/// Per the example config, this is the maximum number of bytes of a URL
/// preview response body to read when spidering.
pub fn url_preview_max_spider_size(&self) -> usize {
self.config.url_preview_max_spider_size
}
/// Returns a borrowed reference to the `forbidden_room_names` regex set held
/// in the loaded configuration.
pub fn forbidden_room_names(&self) -> &RegexSet {
&self.config.forbidden_room_names
}