From c38766c5a60a918b6716c1dc1d343ea47cbc2255 Mon Sep 17 00:00:00 2001 From: Sam Storment Date: Wed, 8 May 2024 22:13:24 -0500 Subject: [PATCH 01/24] (search, WIP) Convert SCSS variables to CSS custom properties for dynamic theming --- .../resources/static/search/serp.scss | 204 +++++++++++------- 1 file changed, 128 insertions(+), 76 deletions(-) diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index 089c38844..7152f6626 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -1,33 +1,87 @@ -$nicotine-dark: #acae89; -$nicotine-light: #f8f8ee; -$fg-dark: #000; -$fg-light: #fff; -$highlight-dark: #2f4858; -$highlight-light: #3F5F6F; -$highlight-light2: #eee; -$border-color: #ccc; -$border-color2: #aaa; -$heading-fonts: serif; -$visited: #fcc; +// $nicotine-dark: #acae89; +// $nicotine-light: #f8f8ee; +// $fg-dark: #000; +// $fg-light: #fff; +// $highlight-dark: #2f4858; +// $highlight-light: #3F5F6F; +// $highlight-light2: #eee; +// $border-color: #ccc; +// $border-color2: #aaa; +// $heading-fonts: serif; +// $visited: #fcc; + +:root { + --clr-bg: #fff; + --clr-text: #000; // $fg-dark + + --clr-bg-page: #f8f8ee; // $nicotine-light + + --clr-bg-theme: #3F5F6F; // $highlight-light + --clr-text-theme: #fff; // $fg-light + + --clr-bg-highlight: #eee; // $highlight-light2 + --clr-text-highlight: #111111; + + --clr-bg-accent: #acae89; // $nicotine-dark + + --clr-text-visited: #fcc; // $visited + + --clr-border: #ccc; // $border-color + --clr-border-dim: #aaa; // $border-color2 + --clr-border-dark: #888; + + --clr-shadow: var(--clr-border); + + --font-family: sans-serif; + --font-size: 14px; + --font-family-heading: serif; // $heading-fonts +} + +@media (prefers-color-scheme: dark) { + :root { + --clr-bg: #2f2f2f; + --clr-text: #ddd; + 
--clr-bg-page: #161616; + + --clr-bg-theme: #111111; + --clr-text-theme: var(--clr-text); + + --clr-bg-highlight: #606060; + --clr-text-highlight: #eee; + + --clr-bg-accent: var(--clr-bg); + + --clr-border: #666666; + --clr-border-dim: #444444; + --clr-border-dark: #222222; + + --clr-shadow: #000; + } +} * { box-sizing: border-box; } + +html { + color-scheme: light dark; +} + h1 a, h2 a { - color: $fg-light; + color: var(--clr-text-theme); } h1 a:visited, h2 a:visited { - color: $visited; + color: var(--clr-text-visited); } progress { width: 10ch; } body { - background-color: $nicotine-light; - color: $fg-dark; - font-family: sans-serif; - font-size: 14px; + background-color: var(--clr-bg-page); + color: var(--clr-text); + font-family: var(--font-family); + font-size: var(--font-size); line-height: 1.6; margin-left: auto; margin-right: auto; @@ -99,28 +153,28 @@ body { li { display: inline; padding: 1ch; - background-color: $highlight-light2; + background-color: var(--clr-bg-highlight); a { text-decoration: none; display: inline-block; - color: #000; + color: var(--clr-text-highlight); } } li.current { - background-color: $highlight-light; + background-color: var(--clr-bg-theme); a { - color: #fff; + color: var(--clr-text-theme); } } } } .dialog { - border: 1px solid $border-color2; - box-shadow: 0 0 1ch $border-color; - background-color: #fff; + border: 1px solid var(--clr-border-dim); + box-shadow: 0 0 1ch var(--clr-shadow); + background-color: var(--clr-bg); padding: 1ch; h2 { @@ -129,22 +183,22 @@ body { font-weight: normal; padding: 0.5ch; font-size: 12pt; - background-color: $highlight-light; - color: #fff; + background-color: var(--clr-bg-theme); + color: var(--clr-text-theme); } } header { - background-color: $nicotine-dark; - color: #fff; - border: 1px solid #888; - box-shadow: 0 0 0.5ch #888; + background-color: var(--clr-bg-accent); + color: var(--clr-text); + border: 1px solid var(--clr-border-dark); + box-shadow: 0 0 0.5ch var(--clr-border-dark); 
margin-bottom: 1ch; nav { a { text-decoration: none; - color: #000; + color: var(--clr-text); padding: .5ch; display: inline-block; @@ -160,8 +214,8 @@ header { } a:hover, a:focus { - background: #2f4858; - color: #fff !important; + background: var(--clr-bg-theme); + color: var(--clr-text-theme); } } } @@ -210,11 +264,11 @@ header { @extend .heading; } - background-color: #fff; + background-color: var(--clr-); padding: 1ch; margin: 1ch; - border: 1px solid $border-color2; - box-shadow: 0 0 1ch $border-color; + border: 1px solid var(--clr-border-dim); + box-shadow: 0 0 1ch var(--clr-shadow); } section.cards { @@ -276,7 +330,8 @@ section.cards { .positions { box-shadow: 0 0 2px #888; - background-color: #e4e4e4; + background-color: var(--clr-bg-highlight); + color: var(--clr-text-highlight); padding: 2px; margin-right: -1ch; margin-left: 1ch; @@ -297,13 +352,13 @@ footer { h1 { font-weight: normal; - border-bottom: 4px solid $highlight-light; + border-bottom: 4px solid var(--clr-bg-theme); } h2 { font-size: 14pt; font-weight: normal; - border-bottom: 2px solid $highlight-dark; + border-bottom: 2px solid var(--clr-bg-theme); width: 80%; } @@ -312,9 +367,9 @@ footer { flex-basis: 40ch; flex-grow: 1.1; - background-color: #fff; - border-left: 1px solid $border-color2; - box-shadow: -1px -1px 5px $border-color; + background-color: var(--clr-bg); + border-left: 1px solid var(--clr-border-dim); + box-shadow: -1px -1px 5px var(--clr-shadow); padding-left: 1ch; padding-right: 1ch; @@ -329,18 +384,18 @@ footer { } .shadowbox { - box-shadow: 0 0 1ch $border-color2; - border: 1px solid $border-color; + box-shadow: 0 0 1ch var(--clr-shadow); + border: 1px solid var(--clr-border); } .heading { margin: 0; padding: 0.5ch; - background-color: $highlight-light; - border-bottom: 1px solid $border-color2; - font-family: $heading-fonts; + background-color: var(--clr-bg-theme); + border-bottom: 1px solid var(--clr-border-dim); + font-family: var(--font-family-heading); font-weight: 
normal; - color: $fg-light; + color: var(--clr-text-theme); font-size: 12pt; word-break: break-word; } @@ -440,7 +495,7 @@ footer { @extend .shadowbox; padding: 0.5ch; - background-color: $fg-light; + background-color: var(--clr-bg); display: grid; grid-template-columns: max-content 0 auto max-content; grid-gap: 0.5ch; @@ -452,9 +507,9 @@ footer { padding: 0.5ch; font-size: 14pt; word-break: keep-all; - background-color: $highlight-light; - color: $fg-light; - font-family: $heading-fonts; + background-color: var(--clr-bg-theme); + color: var(--clr-text-theme); + font-family: var(--font-family-heading); font-weight: normal; border: 1px solid; text-align: center; @@ -469,16 +524,13 @@ footer { font-family: monospace; font-size: 12pt; padding: 0.5ch; - border: 1px solid $border-color2; - background-color: $fg-light; - color: $fg-dark; + border: 1px solid var(--clr-border-dim); } input[type="submit"] { font-size: 12pt; - border: 1px solid $border-color2; - background-color: $fg-light; - color: $fg-dark; + border: 1px solid var(--clr-border-dim); + background-color: var(--clr-bg); } .suggestions { @@ -528,22 +580,22 @@ footer { #filters { @extend .shadowbox; margin-top: 1ch; - background-color: $fg-light; + background-color: var(--clr-bg); h2 { @extend .heading; - background-color: $highlight-light; + background-color: var(--clr-bg-theme); } h3 { @extend .heading; - background-color: $highlight-light2; + background-color: var(--clr-bg-highlight); font-family: sans-serif; color: #000; border-bottom: 1px solid #000; } hr { - border-top: 0.5px solid $border-color2; + border-top: 0.5px solid var(--clr-border-dim); border-bottom: none; } ul { @@ -553,17 +605,17 @@ footer { li { padding: 1ch; a { - color: $fg-dark; + color: inherit; text-decoration: none; } a:hover, a:focus { - border-bottom: 1px solid $highlight-light; + border-bottom: 1px solid var(--clr-bg-theme); } } li.current { - border-left: 4px solid $highlight-light; - background-color: $highlight-light2; + 
border-left: 4px solid var(--clr-bg-theme); + background-color: var(--clr-bg-highlight); a { margin-left: -4px; } @@ -576,46 +628,46 @@ footer { margin: 1ch 0 2ch 0; .url { - background-color: $highlight-light; + background-color: var(--clr-bg-theme); padding-left: 0.5ch; a { word-break: break-all; font-family: monospace; font-size: 8pt; - color: $fg-light; + color: var(--clr-text-theme); text-shadow: 0 0 1ch #000; // guarantee decent contrast across background colors } a:visited { - color: $visited; + color: var(--clr-text-visited); } } h2 { a { word-break: break-all; - color: $fg-dark; + color: var(--clr-text); text-decoration: none; } font-size: 12pt; @extend .heading; - background-color: $highlight-light2; + background-color:var(--clr-bg-highlight); } .description { - background-color: $fg-light; + background-color: var(--clr-bg); word-break: break-word; padding: 1ch; margin: 0; } ul.additional-results { - background-color: $fg-light; + background-color: var(--clr-bg); padding: 1ch; list-style: none; margin: 0; a { - color: $fg-dark; + color: inherit; } } } @@ -631,7 +683,7 @@ footer { display: flex; font-size: 10pt; padding: 1ch; - background-color: #eee; + background-color: var(--clr-bg-highlight); > * { margin-right: 1ch; @@ -645,7 +697,7 @@ footer { padding-left: 4px; } a { - color: #000; + color: var(--clr-text-highlight); } } From bb315221ab83ae4d4e36098029de3ad0ffe8adf3 Mon Sep 17 00:00:00 2001 From: Sam Storment Date: Tue, 14 May 2024 01:32:40 -0500 Subject: [PATCH 02/24] (search, WIP) Make the dark theme look generally nicer. Rename CSS custom properties a bit. Switch a lot of background colors to HSL to make it easy to change colors relative to one another. 
--- .../resources/static/search/serp.scss | 115 ++++++++---------- 1 file changed, 52 insertions(+), 63 deletions(-) diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index 7152f6626..2e6a89aa5 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -1,37 +1,24 @@ -// $nicotine-dark: #acae89; -// $nicotine-light: #f8f8ee; -// $fg-dark: #000; -// $fg-light: #fff; -// $highlight-dark: #2f4858; -// $highlight-light: #3F5F6F; -// $highlight-light2: #eee; -// $border-color: #ccc; -// $border-color2: #aaa; -// $heading-fonts: serif; -// $visited: #fcc; - :root { - --clr-bg: #fff; - --clr-text: #000; // $fg-dark + --clr-bg-page: hsl(60, 42%, 95%); // $nicotine-light - --clr-bg-page: #f8f8ee; // $nicotine-light + --clr-bg-ui: hsl(0, 0%, 100%); + --clr-text-ui: #000; // $fg-dark - --clr-bg-theme: #3F5F6F; // $highlight-light + --clr-bg-theme: hsl(200, 28%, 34%); // $highlight-light --clr-text-theme: #fff; // $fg-light - --clr-bg-highlight: #eee; // $highlight-light2 + --clr-bg-highlight: hsl(0, 0%, 93%); // $highlight-light2 --clr-text-highlight: #111111; - --clr-bg-accent: #acae89; // $nicotine-dark + --clr-bg-accent: hsl(63, 19%, 61%); // $nicotine-dark + --clr-border-accent: hsl(63, 19%, 35%); + + --clr-border: #aaa; // $border-color2 + + --clr-shadow: var(--clr-border); --clr-text-visited: #fcc; // $visited - --clr-border: #ccc; // $border-color - --clr-border-dim: #aaa; // $border-color2 - --clr-border-dark: #888; - - --clr-shadow: var(--clr-border); - --font-family: sans-serif; --font-size: 14px; --font-family-heading: serif; // $heading-fonts @@ -39,23 +26,24 @@ @media (prefers-color-scheme: dark) { :root { - --clr-bg: #2f2f2f; - --clr-text: #ddd; - --clr-bg-page: #161616; + --clr-bg-page: hsl(0, 0%, 6%); - --clr-bg-theme: #111111; - 
--clr-text-theme: var(--clr-text); + --clr-bg-ui: hsl(0, 0%, 18%); + --clr-text-ui: #ddd; - --clr-bg-highlight: #606060; - --clr-text-highlight: #eee; + --clr-bg-theme: hsl(0, 0%, 2%); + --clr-text-theme: var(--clr-text-ui); - --clr-bg-accent: var(--clr-bg); + --clr-bg-highlight: hsl(0, 0%, 11%); + --clr-text-highlight: #fff; - --clr-border: #666666; - --clr-border-dim: #444444; - --clr-border-dark: #222222; + --clr-bg-accent: hsl(200, 32%, 28%); + --clr-border-accent: hsl(200, 8%, 12%); + + --clr-border: hsl(0, 0%, 30%); --clr-shadow: #000; + } } @@ -79,7 +67,7 @@ progress { body { background-color: var(--clr-bg-page); - color: var(--clr-text); + color: var(--clr-text-ui); font-family: var(--font-family); font-size: var(--font-size); line-height: 1.6; @@ -172,9 +160,9 @@ body { } .dialog { - border: 1px solid var(--clr-border-dim); + border: 1px solid var(--clr-border); box-shadow: 0 0 1ch var(--clr-shadow); - background-color: var(--clr-bg); + background-color: var(--clr-bg-ui); padding: 1ch; h2 { @@ -190,15 +178,15 @@ body { header { background-color: var(--clr-bg-accent); - color: var(--clr-text); - border: 1px solid var(--clr-border-dark); - box-shadow: 0 0 0.5ch var(--clr-border-dark); + border: 1px solid var(--clr-border-accent); + color: var(--clr-text-ui); + box-shadow: 0 0 0.5ch var(--clr-shadow); margin-bottom: 1ch; nav { a { text-decoration: none; - color: var(--clr-text); + color: var(--clr-text-ui); padding: .5ch; display: inline-block; @@ -206,9 +194,9 @@ header { a.extra { background: #ccc linear-gradient(45deg, - rgba(255,100,100,1) 0%, - rgba(100,255,100,1) 50%, - rgba(100,100,255,1) 100%); + hsl(0, 100%, 70%) 0%, + hsl(120, 100%, 70%) 50%, + hsl(240, 100%, 70%) 100%); color: black; text-shadow: 0 0 0.5ch #fff; } @@ -267,7 +255,7 @@ header { background-color: var(--clr-); padding: 1ch; margin: 1ch; - border: 1px solid var(--clr-border-dim); + border: 1px solid var(--clr-border); box-shadow: 0 0 1ch var(--clr-shadow); } @@ -281,7 +269,7 @@ 
section.cards { .card { border: 2px #ccc; - background-color: #fff; + background-color: var(--clr-ui); border-left: 1px solid #ecb; border-top: 1px solid #ecb; box-shadow: #0008 0 0 5px; @@ -330,7 +318,7 @@ section.cards { .positions { box-shadow: 0 0 2px #888; - background-color: var(--clr-bg-highlight); + backdrop-filter: brightness(90%); color: var(--clr-text-highlight); padding: 2px; margin-right: -1ch; @@ -367,8 +355,8 @@ footer { flex-basis: 40ch; flex-grow: 1.1; - background-color: var(--clr-bg); - border-left: 1px solid var(--clr-border-dim); + background-color: var(--clr-bg-ui); + border-left: 1px solid var(--clr-border); box-shadow: -1px -1px 5px var(--clr-shadow); padding-left: 1ch; @@ -392,7 +380,7 @@ footer { margin: 0; padding: 0.5ch; background-color: var(--clr-bg-theme); - border-bottom: 1px solid var(--clr-border-dim); + border-bottom: 1px solid var(--clr-border); font-family: var(--font-family-heading); font-weight: normal; color: var(--clr-text-theme); @@ -495,7 +483,7 @@ footer { @extend .shadowbox; padding: 0.5ch; - background-color: var(--clr-bg); + background-color: var(--clr-bg-ui); display: grid; grid-template-columns: max-content 0 auto max-content; grid-gap: 0.5ch; @@ -511,7 +499,6 @@ footer { color: var(--clr-text-theme); font-family: var(--font-family-heading); font-weight: normal; - border: 1px solid; text-align: center; } @@ -524,15 +511,17 @@ footer { font-family: monospace; font-size: 12pt; padding: 0.5ch; - border: 1px solid var(--clr-border-dim); + border: 1px solid var(--clr-border); + background-color: inherit; } input[type="submit"] { font-size: 12pt; - border: 1px solid var(--clr-border-dim); - background-color: var(--clr-bg); + border: 1px solid var(--clr-border); + background-color: var(--clr-bg-ui); } + // white suggesitons looks fine in dark mode .suggestions { background-color: #fff; padding: .5ch; @@ -543,7 +532,7 @@ footer { width: 300px; border-left: 1px solid #ccc; border-top: 1px solid #ccc; - box-shadow: 5px 5px 5px 
#888; + box-shadow: 5px 5px 5px var(--clr-shadow); z-index: 10; a { @@ -580,7 +569,7 @@ footer { #filters { @extend .shadowbox; margin-top: 1ch; - background-color: var(--clr-bg); + background-color: var(--clr-bg-ui); h2 { @extend .heading; @@ -589,13 +578,13 @@ footer { h3 { @extend .heading; background-color: var(--clr-bg-highlight); + color: var(--clr-text-highlight); font-family: sans-serif; - color: #000; border-bottom: 1px solid #000; } hr { - border-top: 0.5px solid var(--clr-border-dim); + border-top: 0.5px solid var(--clr-border); border-bottom: none; } ul { @@ -646,7 +635,7 @@ footer { h2 { a { word-break: break-all; - color: var(--clr-text); + color: var(--clr-text-ui); text-decoration: none; } font-size: 12pt; @@ -655,14 +644,14 @@ footer { } .description { - background-color: var(--clr-bg); + background-color: var(--clr-bg-ui); word-break: break-word; padding: 1ch; margin: 0; } ul.additional-results { - background-color: var(--clr-bg); + background-color: var(--clr-bg-ui); padding: 1ch; list-style: none; margin: 0; From 43489c98d87d41f984b9b0bd19e4ccb09a14913c Mon Sep 17 00:00:00 2001 From: Sam Storment Date: Sun, 19 May 2024 01:06:54 -0500 Subject: [PATCH 03/24] (search) Minor dark theme tweaks after the new mocked UI elements were added --- .../resources/static/search/serp.scss | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index 2e6a89aa5..72fed2ff9 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -252,7 +252,7 @@ header { @extend .heading; } - background-color: var(--clr-); + background-color: var(--clr-bg-ui); padding: 1ch; margin: 1ch; border: 1px solid var(--clr-border); @@ -268,11 +268,10 @@ section.cards { justify-content: flex-start; 
.card { - border: 2px #ccc; - background-color: var(--clr-ui); + background-color: var(--clr-bg-ui); border-left: 1px solid #ecb; border-top: 1px solid #ecb; - box-shadow: #0008 0 0 5px; + box-shadow: var(--clr-shadow) 0 0 5px; h2 { @extend .heading; @@ -281,7 +280,7 @@ section.cards { h2 a { display: block !important; - color: #fff; + color: inherit; text-decoration: none; } a:focus img { @@ -313,11 +312,17 @@ section.cards { padding-right: 1ch; line-height: 1.6; } + + @media (prefers-color-scheme: dark) { + & { + border: 1px solid var(--clr-border); + } + } } } .positions { - box-shadow: 0 0 2px #888; + box-shadow: 0 0 2px var(--clr-shadow); backdrop-filter: brightness(90%); color: var(--clr-text-highlight); padding: 2px; From 5659df43885649f1969148dc0cd5b15f5ee676c5 Mon Sep 17 00:00:00 2001 From: Sam Storment Date: Tue, 21 May 2024 00:03:46 -0500 Subject: [PATCH 04/24] (search) Set link and form field colors manually to override browser defaults with poor dark mode contrast --- .../resources/static/search/serp.scss | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index 72fed2ff9..b2ca3a19d 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -17,7 +17,9 @@ --clr-shadow: var(--clr-border); - --clr-text-visited: #fcc; // $visited + --clr-link: #0066cc; + --clr-link-visited: #531a89; + --clr-heading-link-visited: #fcc; // $visited --font-family: sans-serif; --font-size: 14px; @@ -43,7 +45,10 @@ --clr-border: hsl(0, 0%, 30%); --clr-shadow: #000; - + + --clr-link: #8a8aff; + --clr-link-visited: #ffadff; + --clr-heading-link-visited: var(--clr-link-visited); } } @@ -55,11 +60,23 @@ html { color-scheme: light dark; } +a { + color: var(--clr-link); +} + +a:visited { 
+ color: var(--clr-link-visited); +} + +input, textarea, select { + color: inherit; +} + h1 a, h2 a { color: var(--clr-text-theme); } h1 a:visited, h2 a:visited { - color: var(--clr-text-visited); + color: var(--clr-heading-link-visited); } progress { width: 10ch; @@ -187,11 +204,14 @@ header { a { text-decoration: none; color: var(--clr-text-ui); - padding: .5ch; display: inline-block; } + a:visited { + color: var(--clr-text-ui); + } + a.extra { background: #ccc linear-gradient(45deg, hsl(0, 100%, 70%) 0%, @@ -524,6 +544,7 @@ footer { font-size: 12pt; border: 1px solid var(--clr-border); background-color: var(--clr-bg-ui); + cursor: pointer; } // white suggesitons looks fine in dark mode @@ -633,7 +654,7 @@ footer { text-shadow: 0 0 1ch #000; // guarantee decent contrast across background colors } a:visited { - color: var(--clr-text-visited); + color: var(--clr-heading-link-visited); } } From b4eac2516eaf5a646dfd27a9755aca6db23db893 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 2 Jun 2024 16:30:34 +0200 Subject: [PATCH 05/24] (crawler) Send "Accept"-headers when fetching documents, also indicate we prefer English results --- .../nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 7980f3a7d..1df0301bb 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -183,6 +183,8 @@ public class HttpFetcherImpl implements HttpFetcher { getBuilder.url(url.toString()) .addHeader("Accept-Encoding", "gzip") + .addHeader("Accept-Language", "en,*;q=0.5") + .addHeader("Accept", "text/html, application/xhtml+xml, */*;q=0.8") .addHeader("User-agent", 
userAgentString); contentTags.paint(getBuilder); @@ -225,6 +227,7 @@ public class HttpFetcherImpl implements HttpFetcher { getBuilder.url(url.toString()) .addHeader("Accept-Encoding", "gzip") + .addHeader("Accept", "text/*, */*;q=0.9") .addHeader("User-agent", userAgentString); HttpFetchResult result = recorder.fetch(client, getBuilder.build()); From e2f68d9ccfefe0b43efc2e28b2a659f02fe5e5c0 Mon Sep 17 00:00:00 2001 From: Sam Storment Date: Sun, 2 Jun 2024 21:02:52 -0500 Subject: [PATCH 06/24] Add a theme select to the header that lets users toggle their theme independent of their OS theme --- code/libraries/array/cpp/resources/libcpp.so | Bin 0 -> 16040 bytes .../resources/static/search/serp.scss | 56 +++++++++-------- .../resources/static/search/theme.js | 57 ++++++++++++++++++ .../templates/search/parts/search-header.hdb | 11 ++++ 4 files changed, 99 insertions(+), 25 deletions(-) create mode 100755 code/libraries/array/cpp/resources/libcpp.so create mode 100644 code/services-application/search-service/resources/static/search/theme.js diff --git a/code/libraries/array/cpp/resources/libcpp.so b/code/libraries/array/cpp/resources/libcpp.so new file mode 100755 index 0000000000000000000000000000000000000000..9d6d9616786258903a462cbf295a9cdf0150d75c GIT binary patch literal 16040 zcmeHOeQZ=$cE4kf!+`0`@R8sKwoItAvNxLv<}1tsVa8wFM;?S?J0&667h@0DySB&j zj0vRvATl7mOonyTRvT4Sg|zvD{!wkDZnbKZWNpk>X_n$-!$O)(NDx^&hOB{Xut{M0 zJMY~y9#8C5wtrMxX|6SM?)kjuo^#%P_l@uCuFxxU91g)PS3DvnFd>49(eN-!5N)-{ z$M?6yB1_ge!TMTnTb@l~D7-dKV^}POGo4{L-g z+kEdMlVO3<^C`%Bgp!_5>G_l%!#))(hA!Kzi;+6ZBcreab&z&jmMD3`zz>CT)t`JG;dYoO_zsJ6;Fl=wc<%OsAe{7t`P`N zANi829H9Pug%kf|v-VFluO^=St#ifxo(ICdBX8aF6p3;^$u9XHv`D^v)SrRR$%h`U zYTsRS2;C_g`QV#^L-wYkKLh$)d@|d2LGWb~fmhIwEnrx{DcNXtHFop zhZk6=IIifzFm^RZT8wC{wk>8D!qBQW8TH||@UF&aEZkPTsjMl|60WY@(G<3MQ*(^E zJ+;Qp#+KTq#^=JKIch{BFt=)jXbv|wM|Mw;N|vn_O^rM1{LzSimDr$brDeu4|1$qd zxaGiKj{2~BM2msPb~}T-*$8AKkc~h#0@(;;Ban?i 
zHUfWo5xC*`i(9hiic7}xE<7ZJJkS@*NuHHG$6N!Jv&oeogM2-C|0Mv=VnOosEF!KC zf0RrndkF{kUo399u=v2=U|c5F^`kEnd9M)&jK%K8P&=uQp=Ks8dKT{^a{~%O{v}I( z*i8d znwMj7tFPyhLl2zmC>rj^z`ZhgTAF8N|3#;Cj2*uf z@yh;BoX2l<&wp48yxsMXG|$PzW`|77pDFu4%99DFM~^Sn12NBN_c4eUDDg!k-aRM} z{I_TS^E9Ev#-udQSdnp@K~(g>>Fx(cN_(L*QzjlyVhEQt@X&hcI4cLoqy8U@ za9jokJqN-FoJ<~-<{-@mzSMh0Wukg6x#LTXH8#8sa<(05o|+gDd9II{0gSwSsWBHc}gZ`pOo=A(wu)&3k-C9Tbd^j0mRff zSN4DG1bcxVpQ8t=7mRhE)orQn_vC@Au{N35f-s)5!+6dLV_&y_WbIDBXYqcFK$SCx z!kH0D%y3i-jCM9cSx;cx_+0V=N$5DEn=`jlNh$rr?3);>eONd9EPoLOw#e5 z9K0n1XFUgE6t_gV$dU2tlVda|8g}-{;q6u>A}+c)XdB)+;hSX-ktrUN>*(Rub7(Fe zWkWTHmS(;is@4;`yfxCiU1eS12j2GVuYeS;q&&MGhmfEOA>-{{8Fy*%J)>Ikm&ZOD z?>Z_2XFQ3Y)9eG|p2UBm14$q96h@~T8Z}-m-a3jY>Eo{KJl|2Jq ziICioJwxbR>&)NTYj<7UH)UWT`W=kca{Ab%YfH*p4h&&wV1%ksdzg6=ql~;l^VH2x zb>!9jaA+3A$~-TNhj`ZGaBvgS*{qr8hrdPaz@5%VV6WK=I~DZREXd*EjW*Mh7=+5P z{^ugCsH=0}w-eSK9HprZ7g|!|9Us8NYA`z|&3ZTwY~36{9G!Zi;f^+VTQiU8=8zTC zxSl8UczK>nH*v9iCe7as{ri{6BqI96 z+|?)pqn;Ok2IDety(3IPon1(xs1<3BV19wqU0$rWGXAIw_2ELzSoMLUCym-DApIsT)C1i7KdLp6Hhy9+|qHB~aP=wZ6BwZLyY`#-0M7hlIRrJ28?d*y5Nd({o; zd84oRsH}TiI^MRb@x!4nZ)2Rq4_(^xU+sg4=g?_<>xrKb5_*$Rw0+<*G5 zRK~sFbX>J{1JG?hwkdEqRz}yg7N`2CE*#Sm<%MH(6G{9te88f@>o3_kq3lq#T-8wV z(saW_JM44yhrS;&26pypfvd4kL$6OWkH4gOJaLn*-{s^ENx!bg+cA&j>GL?(xt`at zjCq{vxT>2aBS+ak-TY+2AJ2gYQ7pr|tV$amIVa8G;k$sOU4OJc!JdNinVC&Q3fY`V zmt9&mozrIX_PCWzFS2Rh0p3UOkWE5DZ_;n0Bg^gjLp~Qx%IER%ZfD_;{zWp0Eer~? 
z=Qyn$xs=*+tRxk}z7?#Q*D-+Pj>c)-1ialNh$#8upcKD}6npIj4!aU7zgI)A zTQe(BbviLG%C9}I*zeW6Xinr8rTS-S^9s!CPR*;cG*xeqD6sw6HZd>qW#l!>U)}sD z%{$M5MU?O-RKkBv3BQRI*WL$Px82lE8Z*weYy`3q$VT8#C<2a!xofK75B~Wke51)^ zD_{`tWxy?fuM+)QGIEro^?-+QLwyCX6*nXPzwLa;QFA=E zRXFx|9SdjAaP>l_5VS@39Qhq=01CIa(p|98lmD!1pICqI_r9~da0!t~AGaBC7;?1O zb0a$aBlqPDdJ$+be8}e?VjNNG_8!WS?t-4&oGY{3-cq+q=I1n69g;na&);EQ_mQl0 zdk^Mpwq#$0Y{>1>@^c=uIzh1C!slho{eJpKVYStp6dy)*-T$$gOnyUUWj?Ald4zw}=^Q7u0)@}AQSpL;{BTmB@YI>ipiNL_ z1@q~7k{X@;rKXP2il7>p=`TBNKKpsZMy=;&>asoj_-FaQN6F)tK?~z?VmE%~{rI%fLcqcfHNuYgW=eCnLC^&13e^g9XBd(|Ou>-PrG zql(@)4+xs|+XCQ4MIXJ}A`9#H0(5B}Q;oX}1?>=X#EK=heERoP@lw^c*oOwVp8d$DsQd zNrV3i`c&uOe=&{x86}^2KK_D5Pcy8#SEsQv0~hF$uZs&&*A|P$+VLNrIx%s)-iS3D zb#%Z!igWn&5o1?VWJhh2Q6G!6MUC3_J)$ns+}ae5h3oxGroE$4dL2W zZOh8mRV(%^YpPmmY!2XEftGe0_20A7Fe0tdl9h&rcLWS_(r83ad$Od2&;H|}e??8x z)<>aOwaTb4Dk>^=R~yy9LPm9oQN7IY*F{>QvDUUoD@|^yv74HjrX92?9F1CortGd{ zyN%JdS~j`Lz_ep+5$k<~rbuLBM$=mUBc@Xopc!F0w*ONM_KjyywL+Qlv-?Kn*3c#d zZgV-kmqB@^5{O2OhT4{TdL5(uiOr!+S{bN#lf|fzN#mkhWcPuVa1QS5<*V zdnDH9{UE~^z)NprBfV`0LVGdR=lvH$-d?c(EXS|``n0!WocDJOeQK~w{}ow0iw@al zecn$pJfZ}7{LJ}3uk=fm9PdvVa-2vXciFGSFAf^Tg!j*>7q8UBd7lmutFa%9A4Dfz z->39akOdXwcrtt`LqDh(3|WrNFwXEFGW54721AycPX8wv`p+l@hF@R{lHM}y(V>*| z`SbpXAwLJPjm-G{ROxg6g-=N;SglY7nfh;lLF2RjHlHoY@FFqO8rKh#PNGA5X}X4} z@%wfB9%iOI;|$+{jt3vs=Y4bmEi7m(XJZpM(?0}*YLE4KKgsXa_9_4Qd5`rt547(l z5!UDT`Yt{MBAU&YhC^Vb>x-b0U1~$0LK(0g!)suq>-&PXq)+LyT&6zbH_;&( z_Mi91MbqfhSn2I6AVgE*DHiKLkfPWf)=!6AJL#g^l))jzq;-JS5}psQEA;ivIyWCgm6a literal 0 HcmV?d00001 diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index b2ca3a19d..fe5f231cb 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -1,4 +1,6 @@ :root { + color-scheme: light; + --clr-bg-page: hsl(60, 42%, 95%); // $nicotine-light --clr-bg-ui: 
hsl(0, 0%, 100%); @@ -26,40 +28,39 @@ --font-family-heading: serif; // $heading-fonts } -@media (prefers-color-scheme: dark) { - :root { - --clr-bg-page: hsl(0, 0%, 6%); + +:root[data-theme='dark'] { - --clr-bg-ui: hsl(0, 0%, 18%); - --clr-text-ui: #ddd; + color-scheme: dark; - --clr-bg-theme: hsl(0, 0%, 2%); - --clr-text-theme: var(--clr-text-ui); + --clr-bg-page: hsl(0, 0%, 6%); - --clr-bg-highlight: hsl(0, 0%, 11%); - --clr-text-highlight: #fff; + --clr-bg-ui: hsl(0, 0%, 18%); + --clr-text-ui: #ddd; - --clr-bg-accent: hsl(200, 32%, 28%); - --clr-border-accent: hsl(200, 8%, 12%); + --clr-bg-theme: hsl(0, 0%, 2%); + --clr-text-theme: var(--clr-text-ui); - --clr-border: hsl(0, 0%, 30%); + --clr-bg-highlight: hsl(0, 0%, 11%); + --clr-text-highlight: #fff; - --clr-shadow: #000; + --clr-bg-accent: hsl(200, 32%, 28%); + --clr-border-accent: hsl(200, 8%, 12%); - --clr-link: #8a8aff; - --clr-link-visited: #ffadff; - --clr-heading-link-visited: var(--clr-link-visited); - } + --clr-border: hsl(0, 0%, 30%); + + --clr-shadow: #000; + + --clr-link: #8a8aff; + --clr-link-visited: #ffadff; + --clr-heading-link-visited: var(--clr-link-visited); } + * { box-sizing: border-box; } -html { - color-scheme: light dark; -} - a { color: var(--clr-link); } @@ -199,6 +200,8 @@ header { color: var(--clr-text-ui); box-shadow: 0 0 0.5ch var(--clr-shadow); margin-bottom: 1ch; + display: flex; + align-items: center; nav { a { @@ -228,6 +231,11 @@ header { } } +#theme { + margin-left: auto; + margin-right: .5ch; +} + #complaint { @extend .dialog; max-width: 60ch; @@ -333,10 +341,8 @@ section.cards { line-height: 1.6; } - @media (prefers-color-scheme: dark) { - & { - border: 1px solid var(--clr-border); - } + [data-theme='dark'] & { + border: 1px solid var(--clr-border); } } } diff --git a/code/services-application/search-service/resources/static/search/theme.js b/code/services-application/search-service/resources/static/search/theme.js new file mode 100644 index 000000000..73fdcd263 --- 
/dev/null +++ b/code/services-application/search-service/resources/static/search/theme.js @@ -0,0 +1,57 @@ +function getTheme() { + const theme = window.localStorage.getItem('theme'); + + // if a valid theme is set in localStorage, return it + if (theme === 'dark' || theme === 'light') { + return { value: theme, system: false }; + } + + // if matchMedia is supported and OS theme is dark + if (window.matchMedia('(prefers-color-scheme: dark)').matches) { + return { value: 'dark', system: true }; + } + + return { value: 'light', system: true }; +} + +function setTheme(value) { + if (value === 'dark' || value === 'light') { + window.localStorage.setItem('theme', value); + } else { + window.localStorage.removeItem('theme'); + } + + const theme = getTheme(); + + document.documentElement.setAttribute('data-theme', theme.value); +} + +function initializeTheme() { + const themeSelect = document.getElementById('theme-select'); + + const theme = getTheme(); + + document.documentElement.setAttribute('data-theme', theme.value); + + // system is selected by default in the themeSwitcher so ignore it here + if (!theme.system) { + themeSelect.value = theme.value; + } + + themeSelect.addEventListener('change', e => { + setTheme(e.target.value); + }); + + const mql = window.matchMedia('(prefers-color-scheme: dark)'); + + // if someone changes their theme at the OS level we need to update + // their theme immediately if they're using their OS theme + mql.addEventListener('change', e => { + if (themeSelect.value !== 'system') return; + + if (e.matches) setTheme('dark'); + else setTheme('light'); + }); +} + +initializeTheme(); \ No newline at end of file diff --git a/code/services-application/search-service/resources/templates/search/parts/search-header.hdb b/code/services-application/search-service/resources/templates/search/parts/search-header.hdb index c9ad0daba..805ea8a9b 100644 --- a/code/services-application/search-service/resources/templates/search/parts/search-header.hdb +++ 
b/code/services-application/search-service/resources/templates/search/parts/search-header.hdb @@ -7,4 +7,15 @@ Donate Random +
+ + +
+ + + \ No newline at end of file From fb2eef24d6c34dd91114286ca33c6eb1a3b3cf73 Mon Sep 17 00:00:00 2001 From: Sam Storment Date: Mon, 3 Jun 2024 14:15:35 -0500 Subject: [PATCH 07/24] Handle theming when javascript is disabled. Hide the theme select and fall back to dark media query instead of data-theme attribute --- .../resources/static/search/serp.scss | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index fe5f231cb..a6e3ef24c 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -28,9 +28,8 @@ --font-family-heading: serif; // $heading-fonts } - -:root[data-theme='dark'] { +@mixin dark-theme-mixin { color-scheme: dark; --clr-bg-page: hsl(0, 0%, 6%); @@ -55,7 +54,17 @@ --clr-link-visited: #ffadff; --clr-heading-link-visited: var(--clr-link-visited); } + +:root[data-theme='dark'] { + @include dark-theme-mixin; +} +// Makes theme match the user's OS preference when JS is disabled +@media (prefers-color-scheme: dark) { + :root:not(:has([data-has-js="true"])) { + @include dark-theme-mixin; + } +} * { box-sizing: border-box; @@ -234,6 +243,11 @@ header { #theme { margin-left: auto; margin-right: .5ch; + display: none; + + [data-has-js='true'] & { + display: block; + } } #complaint { From 2d076cbd6720feb9454eac247865c598a30eab32 Mon Sep 17 00:00:00 2001 From: Sam Storment Date: Wed, 5 Jun 2024 18:20:33 -0500 Subject: [PATCH 08/24] (search) move data-has-js attribute from body to html element --- .../search-service/resources/static/search/main.js | 4 ++-- .../search-service/resources/static/search/serp.scss | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/code/services-application/search-service/resources/static/search/main.js 
b/code/services-application/search-service/resources/static/search/main.js index 618533b78..a6bd3157c 100644 --- a/code/services-application/search-service/resources/static/search/main.js +++ b/code/services-application/search-service/resources/static/search/main.js @@ -1,6 +1,6 @@ -// This sets the data-has-js attribute on the body tag to true, so we can style the page with the assumption that +// This sets the data-has-js attribute on the html tag to true, so we can style the page with the assumption that // the browser supports JS. This is a progressive enhancement, so the page will still work without JS. -document.getElementsByTagName('body')[0].setAttribute('data-has-js', 'true'); +document.documentElement.setAttribute('data-has-js', 'true'); // To prevent the filter menu from being opened when the user hits enter on the search box, we need to add a keydown // handler to the search box that stops the event from propagating. Janky hack, but it works. diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index a6e3ef24c..85f59b34d 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -61,7 +61,7 @@ // Makes theme match the user's OS preference when JS is disabled @media (prefers-color-scheme: dark) { - :root:not(:has([data-has-js="true"])) { + :root:not([data-has-js="true"]) { @include dark-theme-mixin; } } @@ -737,7 +737,7 @@ footer { } @media (max-device-width: 624px) { - body[data-has-js="true"] { // This property is set via js so we can selectively enable these changes only if JS is enabled; + [data-has-js="true"] body { // This property is set via js so we can selectively enable these changes only if JS is enabled; // This is desirable since mobile navigation is JS-driven. 
If JS is disabled, having a squished // GUI is better than having no working UI. margin: 0 !important; From 9c06f446fbbc11a5aa0aa131b49b0ca408901e63 Mon Sep 17 00:00:00 2001 From: Sam Storment Date: Wed, 5 Jun 2024 19:55:17 -0500 Subject: [PATCH 09/24] (search) Styling tweaks. Make the filter button near the top right corner a bit bigger so it's easier to press on mobile --- .../search-service/resources/static/search/serp.scss | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/code/services-application/search-service/resources/static/search/serp.scss b/code/services-application/search-service/resources/static/search/serp.scss index 85f59b34d..3e25e7801 100644 --- a/code/services-application/search-service/resources/static/search/serp.scss +++ b/code/services-application/search-service/resources/static/search/serp.scss @@ -211,6 +211,7 @@ header { margin-bottom: 1ch; display: flex; align-items: center; + justify-content: space-between; nav { a { @@ -241,8 +242,7 @@ header { } #theme { - margin-left: auto; - margin-right: .5ch; + padding: .5ch; display: none; [data-has-js='true'] & { @@ -545,6 +545,8 @@ footer { font-family: var(--font-family-heading); font-weight: normal; text-align: center; + display: flex; + justify-content: space-between; } #suggestions-anchor { @@ -753,6 +755,8 @@ footer { #mcfeast { display: inline; float: right; + width: 2rem; + font-size: 1rem; } #menu-close { From a07cf1ba93c852df40ac4fe2889a4e4811326a42 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 6 Jun 2024 13:05:59 +0200 Subject: [PATCH 10/24] (array/cpp) Update gitignore to properly exclude libcpp.so --- code/libraries/array/cpp/resources/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/libraries/array/cpp/resources/.gitignore b/code/libraries/array/cpp/resources/.gitignore index 01b073459..82ac343f9 100644 --- a/code/libraries/array/cpp/resources/.gitignore +++ b/code/libraries/array/cpp/resources/.gitignore @@ -1 +1 
@@ -../../resources/cpp/libcpp.so \ No newline at end of file +libcpp.so \ No newline at end of file From e0459d0c0d1b7ad217a4d8cc75029d8fe518c7bf Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jun 2024 08:57:22 +0200 Subject: [PATCH 11/24] (build) Upgrade parquet dependencies to 1.14.0 This gets rid of a vulnerable transitive dependency. --- code/libraries/array/cpp/resources/libcpp.so | Bin 16040 -> 15600 bytes settings.gradle | 4 ++-- third-party/parquet-floor/build.gradle | 2 +- .../org/apache/hadoop/conf/Configuration.java | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/code/libraries/array/cpp/resources/libcpp.so b/code/libraries/array/cpp/resources/libcpp.so index 9d6d9616786258903a462cbf295a9cdf0150d75c..b247f7981b36614f7e67d889cfd190fa3d3298f6 100755 GIT binary patch literal 15600 zcmeHOeQ;FQb$_c}nFYb_f^krdr|b}=MU7b`839{N(MnplYn})}lG}vRS0M?cj=rp2 zVQ^&$M~LLjY6)Uc14vmXb`#o)@rDH0+NM6QiusQK(RjUl^KoAC_a&a>$lGM@P4cz(NI%Yd;Q8_zJE?;doW zU2FHB-7*^r#kWgA=A$O@T~d6P6d%K+au!3kEmrDCdOoM*Wx^sw$1s!clH$vR1&WWM z^aXI$@5CDZJLBzAe3^dawywrw$m4wxeB_r;TR6nVEryqk+&TQXV zi@b9Tk}ki2AC(-{R_P|`mjmy_kLZglKKt&;g(tPGE1!Ml@wK&kqni(3v-#}!(l}&K z8Hv*TN2tC4zZchStnNCv<~XWzmTkbUo};H0xMg$T$q)WH@HGBh_y^~~Hvzv)Tda-U zj|vLk!h2CC{E2*t)=HH>HN!a2+TLbFV|AS|!_bUKO_kBm*x7iXIT~y1tf{JKX>V(+ zsoU4mX!p(CW7Hq2Gxj&P)wMJ~(WtdHwzjq(1dJNdc8J}uK2OcSa zK(sxuftszU+JX9dBTB7e!q?mQ?RuoV!YB!p1WHN!9Exh{&$fr!-$a%3sq6mcg+%#X zyIbS?n(rrVsogQA?pww`uLQWu?0&{``|MU5PkBJG;WCznr`+OlDGkr}2=!e_!*gy* zo-%orImRKfG1+U@16dDbJ&^T4)&p4&WId4eK-L5QcOJOu{mna~@0wf0b6;AgX`+8H zmSc^Hz7hAZ9Mmd32ekFMmvQy3DnpIhwENH|uUM8P5?dr+BKI`02d88ALDWI2L)1d+ zig(o!A|6%{^xvS)MCqRqD7-7oq!@aCs~DPg3dg7zzZ+W)0bwP;ZC%;#T~#^5Px>C& zQj2b_>!DiFwM;g4G2f|Zapu~j(6g!zFOpN<4?coT7H z{-_{L(fzi|9_}_q)PqaA|7~RGcH6m;;Tx+59VBDV-q0VV`#|(f+-I-Z>xNJGi+N@7^+?QbK&a>&bK z=sgU1S&ZKi!HeGh8uaMqxv8(EF?&}bQOYff20}3$L8>4c=1UZiS`MMaL;cEa}*wg7hTK5(7(ITy+V&K)q^!FrhCV9o2vIR1!JE`>_RXu z+QGOegE80}n0k07XN0+w!jTq=@o8bgQ^`m$`D7iK^~Ca9q(UUV4592kubT_^>_z(9 
z`8S90r@OueFKam>Uik)OLD6#DR{9&$ldYNVEz@*!oCO|%$?(tOTTs308@Av}c7j|& zc8K6;SD$VUPW@GyM^A|0E$`7mFotSFH4*cp+G>$7e8T)CCV_kre9L>Zn>tU@_IPR| z<@aS#_@3yy)#Ra194dM97#QPsx)|uPwX!i~KkQ-H6B>3al7UW9a z$lJGj&WX*nTK7qjs7Q*<)yXHz_n(k=fjK1lCWyYwFE;zSr%+o+62ROwjJYQtS^9?| za+=olU2udoP&lWap?omkB zu$l}Un$E344jwH6krXQLlz5jfV!a|96Onj#K~g04`b6AK%AXd&^WMY>_(243dj}o^ zmk8eV4iEZbh1;Rrx+96PAi5(d;`dHHE7dqmcM=p!T!KC$Lr=8p(B8ZV z!YXR^R-zfN^-VU;&LQ1=Pd9%i%%4o0z6$~7B~duR-s^`X_%34phnnOuS!Bi#Ms-^H?g@ zF2vd?WBt=vv5xCMh5!}qiN{oY-$}*S-Ln><#o8eZH>&WaN7flJ5!av{S9kA)R9uA_ zh6O7SQ7~iDpcqX46Gas>v=dKBrwD%FJ$jwO`TLVXxa5E@~La?Kqv zUlWOzjc!^zc#rx(eN%W(4gL$gb3=;ird4=a)W0emuTH%7OUsHp^$RlGdwd$#s>I(? zBXo-1{7sdn=4ad=g;){YaeI%`Ew7u`b@Q&A?ygT*Kc@^my#e68rzYUFXYEa9C3{# zDiK^fj1w|p;8JVbIP2T3-{y&xaquD7dlknc;DM^BA;iLg(pjH`D)i z>3QH!S{Ci>eK#zNo(13jz_JDblQ%6Z2{`znWv$0FIEcqWH{c1(OCx||c)&lX{)t8Z zbEV>VVz=fv9(Nznw%)(_cOEKOO=$Xm zXC;1zpvzalr)~52j^~6t`Ny2wJ^nsd&V@yu{E)|2?s4mMM1bgH_+5g|l|&!*_>SdN zdh+|6Ij0tRd?AlJl9#jHt_eD|plN@N-!yDH2dM4v_!2qgp8Nr4rN@8F74od<%iZBA z%P|*u)`UEMj9e~9tIYe7$2~my@+-D}6JR|GzuciQYe>hiCqLl~d;9~guxHJ&+)7VT z-#yNMaM)rnfGsAJx5pN9-j+j~k%vFBtg&A$=c}KTqtt<%2B>fd{3D2q{C{t?$M9}*7$DeSOd)5r(hCD^b?g@D|^)1-xDRZuXyiGvC+;WdUO}m`$DfNC3@|5*0ICjrK zZo+lk`E?3!8GctV&a;TS{C_jX8L(p$k+J#nRBS?d)r*Eb`P)a~yO%xgs=PmzVj&<~ zvL48KAnSpw2eKZ>dLZk8tOv3l$a>&^?g4eqkC-ST-$g_iqBqEN?DH9mJqdp|0)&!qCvU8`MIobqa)1xIy4wdvd? zl{Kop$Zl%iQ&1fb*VHjrdCr%*Ce(RL+OJUIe%_bTnO-U@Y@i)dhcX#I=CV6j&*u~! z4w=fr^6&yu>i@i=$6H9*WZDo#<_WuX?yn5z{8+DWcldfpYXlktMi0mru*s%UkNlSH+u``Hs=xtp%Mt-Q&Fxoq!#id3B=g*D$_SO!g z8MVlli;MX@IS!GBYg=~j0AuwABW#4j;e$0s4O$_irr4+{F#^re&bmO++{3oCw9GqR zOJg)D$4iMUolE2f!)R`cHg?9E>FhktxU(_y3jQ?=sSc7YvrLfV#X8&Nar&0__8I5> z?+pmg{a0gtYR6TtClk^o8kjqburRjm4poH>QMnsm9yCPTji$P`2Ku%D9}hHZaB6c* zGb+EdGgK9+00q83Fv2@4>BEHW`WMPWy77f=+p5C|cBotr13T-(gR=Aw1vpvg;{*G9 zggLwEgM?rC(Zd|yN6h&RgcgV%ZjIIL1B`Xru!(DJ?XkuH@;lJc*^XNxc32CfQ2V-? 
zTN>6jH)xWesV>^21sV>wL6{9=op#Sb+)Bu{Sq%eyosBJZ#Gq;&Eio-1-57{99ztC@ zHqh8)?C-2=Z8Vx1z{4$@(x~g~tbcPB5Q8FK|16aTgRA}wN{JcJ4IH(9|S>eEs`FY(?4IB-{{Jh>_$m<)B$U-vG%N{gn zt-<`fzF^3YBi5hk7&d{Q)-K%U^$SD48Z1+Ph1MQNg=8~7ua_7;rwDocO#7cw{N;*{ z*Ix|TPsC4;`b_>faO4wSAEuta%5YwXf>(J-{rR{85VGdjZXPr zHMM_41*@mjZNuw-{+^HH#q``}ID`&w8b9w3#@5;#OgA6@E9juu^E#T>S^Rxwl8i%9 z%lef59KVa`BNFE4@4U{gBQ8zz%j?XA^!=Y%)a5h+8X_n$-!$O)(NDx^&hOB{Xut{M0 zJMY~y9#8C5wtrMxX|6SM?)kjuo^#%P_l@uCuFxxU91g)PS3DvnFd>49(eN-!5N)-{ z$M?6yB1_ge!TMTnTb@l~D7-dKV^}POGo4{L-g z+kEdMlVO3<^C`%Bgp!_5>G_l%!#))(hA!Kzi;+6ZBcreab&z&jmMD3`zz>CT)t`JG;dYoO_zsJ6;Fl=wc<%OsAe{7t`P`N zANi829H9Pug%kf|v-VFluO^=St#ifxo(ICdBX8aF6p3;^$u9XHv`D^v)SrRR$%h`U zYTsRS2;C_g`QV#^L-wYkKLh$)d@|d2LGWb~fmhIwEnrx{DcNXtHFop zhZk6=IIifzFm^RZT8wC{wk>8D!qBQW8TH||@UF&aEZkPTsjMl|60WY@(G<3MQ*(^E zJ+;Qp#+KTq#^=JKIch{BFt=)jXbv|wM|Mw;N|vn_O^rM1{LzSimDr$brDeu4|1$qd zxaGiKj{2~BM2msPb~}T-*$8AKkc~h#0@(;;Ban?i zHUfWo5xC*`i(9hiic7}xE<7ZJJkS@*NuHHG$6N!Jv&oeogM2-C|0Mv=VnOosEF!KC zf0RrndkF{kUo399u=v2=U|c5F^`kEnd9M)&jK%K8P&=uQp=Ks8dKT{^a{~%O{v}I( z*i8d znwMj7tFPyhLl2zmC>rj^z`ZhgTAF8N|3#;Cj2*uf z@yh;BoX2l<&wp48yxsMXG|$PzW`|77pDFu4%99DFM~^Sn12NBN_c4eUDDg!k-aRM} z{I_TS^E9Ev#-udQSdnp@K~(g>>Fx(cN_(L*QzjlyVhEQt@X&hcI4cLoqy8U@ za9jokJqN-FoJ<~-<{-@mzSMh0Wukg6x#LTXH8#8sa<(05o|+gDd9II{0gSwSsWBHc}gZ`pOo=A(wu)&3k-C9Tbd^j0mRff zSN4DG1bcxVpQ8t=7mRhE)orQn_vC@Au{N35f-s)5!+6dLV_&y_WbIDBXYqcFK$SCx z!kH0D%y3i-jCM9cSx;cx_+0V=N$5DEn=`jlNh$rr?3);>eONd9EPoLOw#e5 z9K0n1XFUgE6t_gV$dU2tlVda|8g}-{;q6u>A}+c)XdB)+;hSX-ktrUN>*(Rub7(Fe zWkWTHmS(;is@4;`yfxCiU1eS12j2GVuYeS;q&&MGhmfEOA>-{{8Fy*%J)>Ikm&ZOD z?>Z_2XFQ3Y)9eG|p2UBm14$q96h@~T8Z}-m-a3jY>Eo{KJl|2Jq ziICioJwxbR>&)NTYj<7UH)UWT`W=kca{Ab%YfH*p4h&&wV1%ksdzg6=ql~;l^VH2x zb>!9jaA+3A$~-TNhj`ZGaBvgS*{qr8hrdPaz@5%VV6WK=I~DZREXd*EjW*Mh7=+5P z{^ugCsH=0}w-eSK9HprZ7g|!|9Us8NYA`z|&3ZTwY~36{9G!Zi;f^+VTQiU8=8zTC zxSl8UczK>nH*v9iCe7as{ri{6BqI96 z+|?)pqn;Ok2IDety(3IPon1(xs1<3BV19wqU0$rWGXAIw_2ELzSoMLUCym-DApIsT)C1i7KdLp6Hhy9+|qHB~aP=wZ6BwZLyY`#-0M7hlIRrJ28?d*y5Nd({o; 
zd84oRsH}TiI^MRb@x!4nZ)2Rq4_(^xU+sg4=g?_<>xrKb5_*$Rw0+<*G5 zRK~sFbX>J{1JG?hwkdEqRz}yg7N`2CE*#Sm<%MH(6G{9te88f@>o3_kq3lq#T-8wV z(saW_JM44yhrS;&26pypfvd4kL$6OWkH4gOJaLn*-{s^ENx!bg+cA&j>GL?(xt`at zjCq{vxT>2aBS+ak-TY+2AJ2gYQ7pr|tV$amIVa8G;k$sOU4OJc!JdNinVC&Q3fY`V zmt9&mozrIX_PCWzFS2Rh0p3UOkWE5DZ_;n0Bg^gjLp~Qx%IER%ZfD_;{zWp0Eer~? z=Qyn$xs=*+tRxk}z7?#Q*D-+Pj>c)-1ialNh$#8upcKD}6npIj4!aU7zgI)A zTQe(BbviLG%C9}I*zeW6Xinr8rTS-S^9s!CPR*;cG*xeqD6sw6HZd>qW#l!>U)}sD z%{$M5MU?O-RKkBv3BQRI*WL$Px82lE8Z*weYy`3q$VT8#C<2a!xofK75B~Wke51)^ zD_{`tWxy?fuM+)QGIEro^?-+QLwyCX6*nXPzwLa;QFA=E zRXFx|9SdjAaP>l_5VS@39Qhq=01CIa(p|98lmD!1pICqI_r9~da0!t~AGaBC7;?1O zb0a$aBlqPDdJ$+be8}e?VjNNG_8!WS?t-4&oGY{3-cq+q=I1n69g;na&);EQ_mQl0 zdk^Mpwq#$0Y{>1>@^c=uIzh1C!slho{eJpKVYStp6dy)*-T$$gOnyUUWj?Ald4zw}=^Q7u0)@}AQSpL;{BTmB@YI>ipiNL_ z1@q~7k{X@;rKXP2il7>p=`TBNKKpsZMy=;&>asoj_-FaQN6F)tK?~z?VmE%~{rI%fLcqcfHNuYgW=eCnLC^&13e^g9XBd(|Ou>-PrG zql(@)4+xs|+XCQ4MIXJ}A`9#H0(5B}Q;oX}1?>=X#EK=heERoP@lw^c*oOwVp8d$DsQd zNrV3i`c&uOe=&{x86}^2KK_D5Pcy8#SEsQv0~hF$uZs&&*A|P$+VLNrIx%s)-iS3D zb#%Z!igWn&5o1?VWJhh2Q6G!6MUC3_J)$ns+}ae5h3oxGroE$4dL2W zZOh8mRV(%^YpPmmY!2XEftGe0_20A7Fe0tdl9h&rcLWS_(r83ad$Od2&;H|}e??8x z)<>aOwaTb4Dk>^=R~yy9LPm9oQN7IY*F{>QvDUUoD@|^yv74HjrX92?9F1CortGd{ zyN%JdS~j`Lz_ep+5$k<~rbuLBM$=mUBc@Xopc!F0w*ONM_KjyywL+Qlv-?Kn*3c#d zZgV-kmqB@^5{O2OhT4{TdL5(uiOr!+S{bN#lf|fzN#mkhWcPuVa1QS5<*V zdnDH9{UE~^z)NprBfV`0LVGdR=lvH$-d?c(EXS|``n0!WocDJOeQK~w{}ow0iw@al zecn$pJfZ}7{LJ}3uk=fm9PdvVa-2vXciFGSFAf^Tg!j*>7q8UBd7lmutFa%9A4Dfz z->39akOdXwcrtt`LqDh(3|WrNFwXEFGW54721AycPX8wv`p+l@hF@R{lHM}y(V>*| z`SbpXAwLJPjm-G{ROxg6g-=N;SglY7nfh;lLF2RjHlHoY@FFqO8rKh#PNGA5X}X4} z@%wfB9%iOI;|$+{jt3vs=Y4bmEi7m(XJZpM(?0}*YLE4KKgsXa_9_4Qd5`rt547(l z5!UDT`Yt{MBAU&YhC^Vb>x-b0U1~$0LK(0g!)suq>-&PXq)+LyT&6zbH_;&( z_Mi91MbqfhSn2I6AVgE*DHiKLkfPWf)=!6AJL#g^l))jzq;-JS5}psQEA;ivIyWCgm6a diff --git a/settings.gradle b/settings.gradle index b2f795f9a..a67a0310f 100644 --- a/settings.gradle +++ b/settings.gradle @@ -208,8 +208,8 @@ 
dependencyResolutionManagement { library('sqlite','org.xerial','sqlite-jdbc').version('3.41.2.2') library('javax.annotation','javax.annotation','javax.annotation-api').version('1.3.2') - library('parquet-column', 'org.apache.parquet','parquet-column').version('1.13.1') - library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.13.1') + library('parquet-column', 'org.apache.parquet','parquet-column').version('1.14.0') + library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.14.0') library('curator-framework', 'org.apache.curator','curator-framework').version('5.6.0') library('curator-x-discovery', 'org.apache.curator','curator-x-discovery').version('5.6.0') diff --git a/third-party/parquet-floor/build.gradle b/third-party/parquet-floor/build.gradle index 7b0de5202..08443bb03 100644 --- a/third-party/parquet-floor/build.gradle +++ b/third-party/parquet-floor/build.gradle @@ -9,7 +9,7 @@ java { } dependencies { - implementation ('org.apache.parquet:parquet-column:1.13.1') { + implementation ('org.apache.parquet:parquet-column:1.14.0') { transitive = true } implementation('org.apache.parquet:parquet-hadoop:1.13.1') { diff --git a/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java index a9c3231de..9b0fda553 100644 --- a/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java +++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java @@ -1,6 +1,7 @@ package org.apache.hadoop.conf; public class Configuration { + public Configuration(boolean x) {} public boolean getBoolean(String x, boolean y) { return y; From 801cf4b5da9a3d06888af1191aaabb257fc6eb02 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jun 2024 08:59:40 +0200 Subject: [PATCH 12/24] (search) Fix bad practice usage of innerHTML to set what should be text content. 
--- .../search-service/resources/static/search/tts.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-application/search-service/resources/static/search/tts.js b/code/services-application/search-service/resources/static/search/tts.js index 3ad24f828..20ee9f370 100644 --- a/code/services-application/search-service/resources/static/search/tts.js +++ b/code/services-application/search-service/resources/static/search/tts.js @@ -27,7 +27,7 @@ function setupTypeahead() { for (i=0;i Date: Wed, 12 Jun 2024 09:05:57 +0200 Subject: [PATCH 13/24] (atags) Fix duckdb SQL injection The input comes from the config file so this isn't a very realistic threat vector, and even if it wasn't it's a query in an empty duckdb instance; but adding a validation check to provide a better error message. --- .../marginalia/atags/source/AnchorTagsImpl.java | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java index bd537c6e5..7ff09289f 100644 --- a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java +++ b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java @@ -6,6 +6,7 @@ import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.file.Files; import java.nio.file.Path; import java.sql.Connection; import java.sql.DriverManager; @@ -24,6 +25,10 @@ public class AnchorTagsImpl implements AnchorTagsSource { logger.info("Loading atags from " + atagsPath); + if (!Files.exists(atagsPath)) { + throw new IllegalArgumentException("atags file does not exist: " + atagsPath); + } + try (var stmt = duckdbConnection.createStatement()) { // Insert the domains into a temporary table, then use that to filter the atags table @@ -35,13 +40,18 @@ 
public class AnchorTagsImpl implements AnchorTagsSource { } } - // Project the atags table down to only the relevant domains. This looks like an SQL injection - // vulnerability if you're a validation tool, but the string comes from a trusted source. + // This is a SQL injection vulnerability if you're a validation tool, but the string comes from a trusted source + // -- we validate nonetheless to present a better error message + String path = atagsPath.toAbsolutePath().toString(); + if (path.contains("'")) { + throw new IllegalArgumentException("atags file path contains a single quote: " + path + " and would break the query."); + } + stmt.executeUpdate(""" create table atags as select * from '%s' where dest in (select * from domains) - """.formatted(atagsPath.toAbsolutePath())); + """.formatted(path)); // Free up the memory used by the domains table stmt.executeUpdate("drop table domains"); From 6839415a0bccf89a91443a504551eee9998bd193 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jun 2024 09:07:54 +0200 Subject: [PATCH 14/24] (crawler) Fetch TLS instead of SSL context --- .../crawl/retreival/fetcher/socket/NoSecuritySSL.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java index b6b8a5894..4833c6e7e 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java @@ -32,7 +32,7 @@ public class NoSecuritySSL { @SneakyThrows public static SSLSocketFactory buildSocketFactory() { // Install the all-trusting trust manager - final SSLContext sslContext = SSLContext.getInstance("SSL"); + final SSLContext sslContext = SSLContext.getInstance("TLS"); sslContext.init(null, trustAllCerts, 
new java.security.SecureRandom()); var clientSessionContext = sslContext.getClientSessionContext(); From 0ffbbaf4b961fd3c15650c98b8da4a882a7c9e42 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 12 Jun 2024 09:14:12 +0200 Subject: [PATCH 15/24] (crawler) Update WARC builder to use SHA-256 for digests --- .../crawl/retreival/fetcher/warc/WarcDigestBuilder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java index b3a8ab4f7..79b4c86a2 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java @@ -8,7 +8,7 @@ import java.security.NoSuchAlgorithmException; class WarcDigestBuilder { private final MessageDigest digest; - private static final String digestAlgorithm = "SHA-1"; + private static final String digestAlgorithm = "SHA-256"; public WarcDigestBuilder() throws NoSuchAlgorithmException { this.digest = MessageDigest.getInstance(digestAlgorithm); From 9974b31a09b605cc12c2010e576d0aec4eb73867 Mon Sep 17 00:00:00 2001 From: Jaseem Abid Date: Wed, 12 Jun 2024 12:45:49 +0100 Subject: [PATCH 16/24] Don't track build files(libcpp.so) with git --- code/libraries/array/cpp/.gitignore | 1 + code/libraries/array/cpp/resources/libcpp.so | Bin 16040 -> 0 bytes 2 files changed, 1 insertion(+) create mode 100644 code/libraries/array/cpp/.gitignore delete mode 100755 code/libraries/array/cpp/resources/libcpp.so diff --git a/code/libraries/array/cpp/.gitignore b/code/libraries/array/cpp/.gitignore new file mode 100644 index 000000000..a52549f59 --- /dev/null +++ b/code/libraries/array/cpp/.gitignore @@ -0,0 +1 @@ +resources/libcpp.so diff --git a/code/libraries/array/cpp/resources/libcpp.so 
b/code/libraries/array/cpp/resources/libcpp.so deleted file mode 100755 index 9d6d9616786258903a462cbf295a9cdf0150d75c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16040 zcmeHOeQZ=$cE4kf!+`0`@R8sKwoItAvNxLv<}1tsVa8wFM;?S?J0&667h@0DySB&j zj0vRvATl7mOonyTRvT4Sg|zvD{!wkDZnbKZWNpk>X_n$-!$O)(NDx^&hOB{Xut{M0 zJMY~y9#8C5wtrMxX|6SM?)kjuo^#%P_l@uCuFxxU91g)PS3DvnFd>49(eN-!5N)-{ z$M?6yB1_ge!TMTnTb@l~D7-dKV^}POGo4{L-g z+kEdMlVO3<^C`%Bgp!_5>G_l%!#))(hA!Kzi;+6ZBcreab&z&jmMD3`zz>CT)t`JG;dYoO_zsJ6;Fl=wc<%OsAe{7t`P`N zANi829H9Pug%kf|v-VFluO^=St#ifxo(ICdBX8aF6p3;^$u9XHv`D^v)SrRR$%h`U zYTsRS2;C_g`QV#^L-wYkKLh$)d@|d2LGWb~fmhIwEnrx{DcNXtHFop zhZk6=IIifzFm^RZT8wC{wk>8D!qBQW8TH||@UF&aEZkPTsjMl|60WY@(G<3MQ*(^E zJ+;Qp#+KTq#^=JKIch{BFt=)jXbv|wM|Mw;N|vn_O^rM1{LzSimDr$brDeu4|1$qd zxaGiKj{2~BM2msPb~}T-*$8AKkc~h#0@(;;Ban?i zHUfWo5xC*`i(9hiic7}xE<7ZJJkS@*NuHHG$6N!Jv&oeogM2-C|0Mv=VnOosEF!KC zf0RrndkF{kUo399u=v2=U|c5F^`kEnd9M)&jK%K8P&=uQp=Ks8dKT{^a{~%O{v}I( z*i8d znwMj7tFPyhLl2zmC>rj^z`ZhgTAF8N|3#;Cj2*uf z@yh;BoX2l<&wp48yxsMXG|$PzW`|77pDFu4%99DFM~^Sn12NBN_c4eUDDg!k-aRM} z{I_TS^E9Ev#-udQSdnp@K~(g>>Fx(cN_(L*QzjlyVhEQt@X&hcI4cLoqy8U@ za9jokJqN-FoJ<~-<{-@mzSMh0Wukg6x#LTXH8#8sa<(05o|+gDd9II{0gSwSsWBHc}gZ`pOo=A(wu)&3k-C9Tbd^j0mRff zSN4DG1bcxVpQ8t=7mRhE)orQn_vC@Au{N35f-s)5!+6dLV_&y_WbIDBXYqcFK$SCx z!kH0D%y3i-jCM9cSx;cx_+0V=N$5DEn=`jlNh$rr?3);>eONd9EPoLOw#e5 z9K0n1XFUgE6t_gV$dU2tlVda|8g}-{;q6u>A}+c)XdB)+;hSX-ktrUN>*(Rub7(Fe zWkWTHmS(;is@4;`yfxCiU1eS12j2GVuYeS;q&&MGhmfEOA>-{{8Fy*%J)>Ikm&ZOD z?>Z_2XFQ3Y)9eG|p2UBm14$q96h@~T8Z}-m-a3jY>Eo{KJl|2Jq ziICioJwxbR>&)NTYj<7UH)UWT`W=kca{Ab%YfH*p4h&&wV1%ksdzg6=ql~;l^VH2x zb>!9jaA+3A$~-TNhj`ZGaBvgS*{qr8hrdPaz@5%VV6WK=I~DZREXd*EjW*Mh7=+5P z{^ugCsH=0}w-eSK9HprZ7g|!|9Us8NYA`z|&3ZTwY~36{9G!Zi;f^+VTQiU8=8zTC zxSl8UczK>nH*v9iCe7as{ri{6BqI96 z+|?)pqn;Ok2IDety(3IPon1(xs1<3BV19wqU0$rWGXAIw_2ELzSoMLUCym-DApIsT)C1i7KdLp6Hhy9+|qHB~aP=wZ6BwZLyY`#-0M7hlIRrJ28?d*y5Nd({o; zd84oRsH}TiI^MRb@x!4nZ)2Rq4_(^xU+sg4=g?_<>xrKb5_*$Rw0+<*G5 
zRK~sFbX>J{1JG?hwkdEqRz}yg7N`2CE*#Sm<%MH(6G{9te88f@>o3_kq3lq#T-8wV z(saW_JM44yhrS;&26pypfvd4kL$6OWkH4gOJaLn*-{s^ENx!bg+cA&j>GL?(xt`at zjCq{vxT>2aBS+ak-TY+2AJ2gYQ7pr|tV$amIVa8G;k$sOU4OJc!JdNinVC&Q3fY`V zmt9&mozrIX_PCWzFS2Rh0p3UOkWE5DZ_;n0Bg^gjLp~Qx%IER%ZfD_;{zWp0Eer~? z=Qyn$xs=*+tRxk}z7?#Q*D-+Pj>c)-1ialNh$#8upcKD}6npIj4!aU7zgI)A zTQe(BbviLG%C9}I*zeW6Xinr8rTS-S^9s!CPR*;cG*xeqD6sw6HZd>qW#l!>U)}sD z%{$M5MU?O-RKkBv3BQRI*WL$Px82lE8Z*weYy`3q$VT8#C<2a!xofK75B~Wke51)^ zD_{`tWxy?fuM+)QGIEro^?-+QLwyCX6*nXPzwLa;QFA=E zRXFx|9SdjAaP>l_5VS@39Qhq=01CIa(p|98lmD!1pICqI_r9~da0!t~AGaBC7;?1O zb0a$aBlqPDdJ$+be8}e?VjNNG_8!WS?t-4&oGY{3-cq+q=I1n69g;na&);EQ_mQl0 zdk^Mpwq#$0Y{>1>@^c=uIzh1C!slho{eJpKVYStp6dy)*-T$$gOnyUUWj?Ald4zw}=^Q7u0)@}AQSpL;{BTmB@YI>ipiNL_ z1@q~7k{X@;rKXP2il7>p=`TBNKKpsZMy=;&>asoj_-FaQN6F)tK?~z?VmE%~{rI%fLcqcfHNuYgW=eCnLC^&13e^g9XBd(|Ou>-PrG zql(@)4+xs|+XCQ4MIXJ}A`9#H0(5B}Q;oX}1?>=X#EK=heERoP@lw^c*oOwVp8d$DsQd zNrV3i`c&uOe=&{x86}^2KK_D5Pcy8#SEsQv0~hF$uZs&&*A|P$+VLNrIx%s)-iS3D zb#%Z!igWn&5o1?VWJhh2Q6G!6MUC3_J)$ns+}ae5h3oxGroE$4dL2W zZOh8mRV(%^YpPmmY!2XEftGe0_20A7Fe0tdl9h&rcLWS_(r83ad$Od2&;H|}e??8x z)<>aOwaTb4Dk>^=R~yy9LPm9oQN7IY*F{>QvDUUoD@|^yv74HjrX92?9F1CortGd{ zyN%JdS~j`Lz_ep+5$k<~rbuLBM$=mUBc@Xopc!F0w*ONM_KjyywL+Qlv-?Kn*3c#d zZgV-kmqB@^5{O2OhT4{TdL5(uiOr!+S{bN#lf|fzN#mkhWcPuVa1QS5<*V zdnDH9{UE~^z)NprBfV`0LVGdR=lvH$-d?c(EXS|``n0!WocDJOeQK~w{}ow0iw@al zecn$pJfZ}7{LJ}3uk=fm9PdvVa-2vXciFGSFAf^Tg!j*>7q8UBd7lmutFa%9A4Dfz z->39akOdXwcrtt`LqDh(3|WrNFwXEFGW54721AycPX8wv`p+l@hF@R{lHM}y(V>*| z`SbpXAwLJPjm-G{ROxg6g-=N;SglY7nfh;lLF2RjHlHoY@FFqO8rKh#PNGA5X}X4} z@%wfB9%iOI;|$+{jt3vs=Y4bmEi7m(XJZpM(?0}*YLE4KKgsXa_9_4Qd5`rt547(l z5!UDT`Yt{MBAU&YhC^Vb>x-b0U1~$0LK(0g!)suq>-&PXq)+LyT&6zbH_;&( z_Mi91MbqfhSn2I6AVgE*DHiKLkfPWf)=!6AJL#g^l))jzq;-JS5}psQEA;ivIyWCgm6a From 0dd14a4bd0e7a3b56584ce2d5f24dbaa14e39783 Mon Sep 17 00:00:00 2001 From: Jaseem Abid Date: Wed, 12 Jun 2024 12:46:15 +0100 Subject: [PATCH 17/24] Specify C++ standard in build command The default C++ language standard on macOS 
is gnu++98, which won't build this module. Full error: ``` > Task :code:libraries:array:cpp:compileCpp FAILED src/main/cpp/cpphelpers.cpp:28:5: error: expected expression [](const p64x2& fst, const p64x2& snd) { ^ ``` --- code/libraries/array/cpp/compile.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 code/libraries/array/cpp/compile.sh diff --git a/code/libraries/array/cpp/compile.sh b/code/libraries/array/cpp/compile.sh old mode 100644 new mode 100755 index 89c6d1d65..477135692 --- a/code/libraries/array/cpp/compile.sh +++ b/code/libraries/array/cpp/compile.sh @@ -7,4 +7,4 @@ if ! which ${CXX} > /dev/null; then exit 0 fi -${CXX} -O3 -march=native -shared -Isrc/main/public src/main/cpp/*.cpp -o resources/libcpp.so \ No newline at end of file +${CXX} -O3 -march=native -std=c++14 -shared -Isrc/main/public src/main/cpp/*.cpp -o resources/libcpp.so From 2168b7cf7d37c15552dafdc84f25d7d6dc3b6778 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 16 Jun 2024 10:01:19 +0200 Subject: [PATCH 18/24] (docs) Update docs with clearer references to the full guide The commit also mentions the non-docker install --- run/readme.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/run/readme.md b/run/readme.md index 0a890feba..041f55761 100644 --- a/run/readme.md +++ b/run/readme.md @@ -3,11 +3,11 @@ This directory is a staging area for running the system. It contains scripts and templates for installing the system on a server, and for running it locally. -See [https://docs.marginalia.nu/](https://docs.marginalia.nu/) for additional -documentation. - ## Requirements +**x86-64 Linux** - The system is only tested on x86-64 Linux. It may work on other +platforms, but for lack of suitable hardware, this can not be guaranteed. 
+ **Docker** - It is a bit of a pain to install, but if you follow [this guide](https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository) you're on the right track for ubuntu-like systems. @@ -15,7 +15,12 @@ documentation. The civilized way of installing this is to use [SDKMAN](https://sdkman.io/); graalce is a good distribution choice but it doesn't matter too much. -## Set up +## Quick Set up + +[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install +and operation of the search engine. This is a quick guide for the impatient. + +--- To go from a clean check out of the git repo to a running search engine, follow these steps. @@ -51,6 +56,8 @@ you for which installation mode you want to use. The options are: 2. Full Marginalia Search instance - This will install an instance of the search engine configured like [search.marginalia.nu](https://search.marginalia.nu). This is useful for local development and testing. +3. Non-docker installation - This will install the system outside of docker. + This is still an experimental run-mode. It will also prompt you for account details for a new mariadb instance, which will be created for you. 
The database will be initialized with the schema and data required From 54caf17107ae4cc7f94000428d9104ce97abe74c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 16 Jun 2024 10:22:07 +0200 Subject: [PATCH 19/24] (docs) Amend install instructions for non-docker install --- run/install.sh | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/run/install.sh b/run/install.sh index 4b2bb359d..b9174667f 100755 --- a/run/install.sh +++ b/run/install.sh @@ -149,7 +149,11 @@ elif [ "${INSTANCE_TYPE}" == "4" ]; then envsubst < install/docker-compose-scaffold.yml.template >${INSTALL_DIR}/docker-compose.yml cat < ${INSTALL_DIR}/README -Quick note about running Marginalia Search in a non-docker environment: +Quick note about running Marginalia Search in a non-docker environment. + +This installation mode is not recommended, as it is significantly more complex +to set up and maintain: You will need to manage the services yourself, including +port management. * The template sets up a sample (in-docker) setup for mariadb and zookeeper. These can also be run outside @@ -177,13 +181,16 @@ A working setup needs at all the services * index [ http port is internal ] * executor [ http port is internal ] -The index and executor services should be on the same partition e.g. index:1 and executor:1, -which should be a number larger than 0. You can have multiple pairs of index and executor partitions, -but the pair should run on the same physical machine with the same install directory. +Since you will need to manage ports yourself, you must assign distinct ports-pairs to each service. -The query service can use any partition number. +* An index and executor services should exist on the same partition e.g. index:1 and executor:1. The partition +number is the last digit of the service name, and should be positive. 
You can have multiple pairs of index +and executor partitions, but the pair should run on the same physical machine with the same install directory. + +* The query service can use any partition number. + +* The control service should be on partition 1. -The control service should be on partition 1. EOF echo From d0d6bb173c90d7c63c77b3655af194ff352fbdf4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 17 Jun 2024 12:40:25 +0200 Subject: [PATCH 20/24] (control) Fix warc data http status filter default value --- .../nu/marginalia/control/node/svc/ControlCrawlDataService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java index a1ce22c2e..31928bdb6 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java @@ -86,7 +86,7 @@ public class ControlCrawlDataService { ORDER BY httpStatus """); while (rs.next()) { - final boolean isCurrentFilter = selectedContentType.equals(rs.getString("httpStatus")); + final boolean isCurrentFilter = selectedHttpStatus.equals(rs.getString("httpStatus")); final int status = rs.getInt("httpStatus"); final int cnt = rs.getInt("cnt"); From 67703e2274eb37c519e27b970e9bd7110a96f0ca Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 17 Jun 2024 13:15:15 +0200 Subject: [PATCH 21/24] (run) Update install.sh with stronger warnings against non-docker install. 
--- run/install.sh | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/run/install.sh b/run/install.sh index 4b2bb359d..152040666 100755 --- a/run/install.sh +++ b/run/install.sh @@ -41,7 +41,7 @@ echo echo "1) barebones instance (1 node)" echo "2) barebones instance (2 nodes)" echo "3) full Marginalia Search instance?" -echo "4) non-docker install? (not recommended)" +echo "4) non-docker install? (proof-of-concept, not recommended)" echo read -p "Enter 1, 2, 3, or 4: " INSTANCE_TYPE @@ -149,17 +149,24 @@ elif [ "${INSTANCE_TYPE}" == "4" ]; then envsubst < install/docker-compose-scaffold.yml.template >${INSTALL_DIR}/docker-compose.yml cat < ${INSTALL_DIR}/README -Quick note about running Marginalia Search in a non-docker environment: +Quick note about running Marginalia Search in a non-docker environment. -* The template sets up a sample (in-docker) setup for - mariadb and zookeeper. These can also be run outside - of docker, but you will need to update the db.properties - file and "zookeeper-hosts" in the system.properties - file to point to the correct locations/addresses. -* Each service is spawned by the same launcher. When building - the project with "gradlew assemble", the launcher is put in - "code/services-core/single-service-runner/build/distributions/marginalia.tar". - This needs to be extracted. +Beware that this installation mode is more of a proof-of-concept and demonstration that the +system is not unhealthily dependent on docker, than a production-ready setup, and is not +recommended for production use! The container setup is much more robust and easier to manage. + +Note: This script only sets up an install directory, and does not build the system. +You will need to build the system with "gradlew assemble" before you can run it. + +Each service is spawned by the same launcher. 
After building the project with +"gradlew assemble", the launcher is put in "code/services-core/single-service-runner/build/distributions/marginalia.tar". +This needs to be extracted! + +Note: The template sets up a sample (in-docker) setup for mariadb and zookeeper. These can also be run outside +of docker, but you will need to update the db.properties file and "zookeeper-hosts" in the system.properties +file to point to the correct locations/addresses. + +Running: To launch a process you need to unpack it, and then run the launcher with the appropriate arguments. For example: From d86926be5f7588517e2542b0c93899c07e963ee2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Jul 2024 15:31:47 +0200 Subject: [PATCH 22/24] (crawl) Add new functionality for re-crawling a single domain --- .../executor/client/ExecutorCrawlClient.java | 9 ++ .../api/src/main/protobuf/executor-api.proto | 5 ++ .../nu/marginalia/actor/ExecutorActor.java | 1 + .../actor/ExecutorActorControlService.java | 4 + .../nu/marginalia/actor/task/CrawlActor.java | 4 +- .../marginalia/actor/task/RecrawlActor.java | 2 +- .../actor/task/RecrawlSingleDomainActor.java | 85 +++++++++++++++++++ .../execution/ExecutorCrawlGrpcService.java | 16 ++++ .../mqapi/crawling/CrawlRequest.java | 16 ++++ .../java/nu/marginalia/crawl/CrawlerMain.java | 43 +++++++++- .../node/svc/ControlNodeActionsService.java | 19 +++++ .../node-storage-crawl-parquet-details.hdb | 10 ++- 12 files changed, 208 insertions(+), 6 deletions(-) create mode 100644 code/execution/java/nu/marginalia/actor/task/RecrawlSingleDomainActor.java diff --git a/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java b/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java index b037702df..25610892d 100644 --- a/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java +++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorCrawlClient.java @@ -44,6 +44,15 @@ public 
class ExecutorCrawlClient { .build()); } + public void triggerRecrawlSingleDomain(int node, FileStorageId fid, String domainName) { + channelPool.call(ExecutorCrawlApiBlockingStub::triggerSingleDomainRecrawl) + .forNode(node) + .run(RpcFileStorageIdWithDomainName.newBuilder() + .setFileStorageId(fid.id()) + .setTargetDomainName(domainName) + .build()); + } + public void triggerConvert(int node, FileStorageId fid) { channelPool.call(ExecutorCrawlApiBlockingStub::triggerConvert) .forNode(node) diff --git a/code/execution/api/src/main/protobuf/executor-api.proto b/code/execution/api/src/main/protobuf/executor-api.proto index 565770acc..2858d60b9 100644 --- a/code/execution/api/src/main/protobuf/executor-api.proto +++ b/code/execution/api/src/main/protobuf/executor-api.proto @@ -22,6 +22,7 @@ service ExecutorApi { service ExecutorCrawlApi { rpc triggerCrawl(RpcFileStorageId) returns (Empty) {} rpc triggerRecrawl(RpcFileStorageId) returns (Empty) {} + rpc triggerSingleDomainRecrawl(RpcFileStorageIdWithDomainName) returns (Empty) {} rpc triggerConvert(RpcFileStorageId) returns (Empty) {} rpc triggerConvertAndLoad(RpcFileStorageId) returns (Empty) {} rpc loadProcessedData(RpcFileStorageIds) returns (Empty) {} @@ -55,6 +56,10 @@ message RpcProcessId { message RpcFileStorageId { int64 fileStorageId = 1; } +message RpcFileStorageIdWithDomainName { + int64 fileStorageId = 1; + string targetDomainName = 2; +} message RpcFileStorageIds { repeated int64 fileStorageIds = 1; } diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActor.java b/code/execution/java/nu/marginalia/actor/ExecutorActor.java index d04b3eaa8..e59ecd9c3 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActor.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActor.java @@ -3,6 +3,7 @@ package nu.marginalia.actor; public enum ExecutorActor { CRAWL, RECRAWL, + RECRAWL_SINGLE_DOMAIN, CONVERT_AND_LOAD, PROC_CONVERTER_SPAWNER, PROC_LOADER_SPAWNER, diff --git 
a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java index 6f37d7abd..591119f8a 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java @@ -26,6 +26,7 @@ public class ExecutorActorControlService { private final ExecutorActorStateMachines stateMachines; public Map actorDefinitions = new HashMap<>(); private final int node; + @Inject public ExecutorActorControlService(MessageQueueFactory messageQueueFactory, BaseServiceParams baseServiceParams, @@ -33,6 +34,7 @@ public class ExecutorActorControlService { ConvertAndLoadActor convertAndLoadActor, CrawlActor crawlActor, RecrawlActor recrawlActor, + RecrawlSingleDomainActor recrawlSingleDomainActor, RestoreBackupActor restoreBackupActor, ConverterMonitorActor converterMonitorFSM, CrawlerMonitorActor crawlerMonitorActor, @@ -57,6 +59,8 @@ public class ExecutorActorControlService { register(ExecutorActor.CRAWL, crawlActor); register(ExecutorActor.RECRAWL, recrawlActor); + register(ExecutorActor.RECRAWL_SINGLE_DOMAIN, recrawlSingleDomainActor); + register(ExecutorActor.CONVERT, convertActor); register(ExecutorActor.RESTORE_BACKUP, restoreBackupActor); register(ExecutorActor.CONVERT_AND_LOAD, convertAndLoadActor); diff --git a/code/execution/java/nu/marginalia/actor/task/CrawlActor.java b/code/execution/java/nu/marginalia/actor/task/CrawlActor.java index 3e097554f..0a7428888 100644 --- a/code/execution/java/nu/marginalia/actor/task/CrawlActor.java +++ b/code/execution/java/nu/marginalia/actor/task/CrawlActor.java @@ -50,7 +50,9 @@ public class CrawlActor extends RecordActorPrototype { storageService.relateFileStorages(storage.id(), dataArea.id()); // Send convert request - long msgId = mqCrawlerOutbox.sendAsync(new CrawlRequest(List.of(fid), dataArea.id())); + long msgId = mqCrawlerOutbox.sendAsync( + 
CrawlRequest.forSpec(fid, dataArea.id()) + ); yield new Crawl(msgId); } diff --git a/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java b/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java index 2b748ceda..0eefd4ef5 100644 --- a/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java +++ b/code/execution/java/nu/marginalia/actor/task/RecrawlActor.java @@ -59,7 +59,7 @@ public class RecrawlActor extends RecordActorPrototype { refreshService.synchronizeDomainList(); - long id = mqCrawlerOutbox.sendAsync(new CrawlRequest(null, fid)); + long id = mqCrawlerOutbox.sendAsync(CrawlRequest.forRecrawl(fid)); yield new Crawl(id, fid, cascadeLoad); } diff --git a/code/execution/java/nu/marginalia/actor/task/RecrawlSingleDomainActor.java b/code/execution/java/nu/marginalia/actor/task/RecrawlSingleDomainActor.java new file mode 100644 index 000000000..990da5aa9 --- /dev/null +++ b/code/execution/java/nu/marginalia/actor/task/RecrawlSingleDomainActor.java @@ -0,0 +1,85 @@ +package nu.marginalia.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.actor.prototype.RecordActorPrototype; +import nu.marginalia.actor.state.ActorResumeBehavior; +import nu.marginalia.actor.state.ActorStep; +import nu.marginalia.actor.state.Resume; +import nu.marginalia.mq.MqMessageState; +import nu.marginalia.mq.outbox.MqOutbox; +import nu.marginalia.mqapi.crawling.CrawlRequest; +import nu.marginalia.process.ProcessOutboxes; +import nu.marginalia.process.ProcessService; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageId; +import nu.marginalia.storage.model.FileStorageType; + +@Singleton +public class RecrawlSingleDomainActor extends RecordActorPrototype { + + private final MqOutbox mqCrawlerOutbox; + private final FileStorageService storageService; + private final ActorProcessWatcher processWatcher; + + /** Initial step + * @param storageId - 
the id of the storage to recrawl + * @param targetDomainName - domain to be recrawled + */ + public record Initial(FileStorageId storageId, String targetDomainName) implements ActorStep {} + + /** The action step */ + @Resume(behavior = ActorResumeBehavior.RETRY) + public record Crawl(long messageId) implements ActorStep {} + + @Override + public ActorStep transition(ActorStep self) throws Exception { + return switch (self) { + case Initial (FileStorageId fid, String targetDomainName) -> { + var crawlStorage = storageService.getStorage(fid); + + if (crawlStorage == null) yield new Error("Bad storage id"); + if (crawlStorage.type() != FileStorageType.CRAWL_DATA) yield new Error("Bad storage type " + crawlStorage.type()); + + long id = mqCrawlerOutbox.sendAsync( + CrawlRequest.forSingleDomain(targetDomainName, fid) + ); + + yield new Crawl(id); + } + case Crawl (long msgId) -> { + var rsp = processWatcher.waitResponse( + mqCrawlerOutbox, + ProcessService.ProcessId.CRAWLER, + msgId); + + if (rsp.state() != MqMessageState.OK) { + yield new Error("Crawler failed"); + } + + yield new End(); + } + default -> new End(); + }; + } + + @Override + public String describe() { + return "Run the crawler only re-fetching a single domain"; + } + + @Inject + public RecrawlSingleDomainActor(ActorProcessWatcher processWatcher, + ProcessOutboxes processOutboxes, + FileStorageService storageService, + Gson gson) + { + super(gson); + + this.processWatcher = processWatcher; + this.mqCrawlerOutbox = processOutboxes.getCrawlerOutbox(); + this.storageService = storageService; + } + +} diff --git a/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java index b95f64d01..206480155 100644 --- a/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java +++ b/code/execution/java/nu/marginalia/execution/ExecutorCrawlGrpcService.java @@ -47,6 +47,22 @@ public class ExecutorCrawlGrpcService 
extends ExecutorCrawlApiGrpc.ExecutorCrawl } } + @Override + public void triggerSingleDomainRecrawl(RpcFileStorageIdWithDomainName request, StreamObserver responseObserver) { + try { + actorControlService.startFrom(ExecutorActor.RECRAWL_SINGLE_DOMAIN, + new RecrawlSingleDomainActor.Initial( + FileStorageId.of(request.getFileStorageId()), + request.getTargetDomainName())); + + responseObserver.onNext(Empty.getDefaultInstance()); + responseObserver.onCompleted(); + } + catch (Exception e) { + responseObserver.onError(e); + } + } + @Override public void triggerConvert(RpcFileStorageId request, StreamObserver responseObserver) { try { diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java b/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java index 40cd30ce4..ff090140f 100644 --- a/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java +++ b/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java @@ -14,8 +14,24 @@ public class CrawlRequest { */ public List specStorage; + /** (optional) Name of a single domain to be re-crawled */ + public String targetDomainName; + /** File storage where the crawl data will be written. If it contains existing crawl data, * this crawl data will be referenced for e-tags and last-mofified checks. 
*/ public FileStorageId crawlStorage; + + public static CrawlRequest forSpec(FileStorageId specStorage, FileStorageId crawlStorage) { + return new CrawlRequest(List.of(specStorage), null, crawlStorage); + } + + public static CrawlRequest forSingleDomain(String targetDomainName, FileStorageId crawlStorage) { + return new CrawlRequest(null, targetDomainName, crawlStorage); + } + + public static CrawlRequest forRecrawl(FileStorageId crawlStorage) { + return new CrawlRequest(null, null, crawlStorage); + } + } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index 1b04c0f96..5173af756 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -23,6 +23,7 @@ import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawlerOutputFile; import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.crawlspec.CrawlSpecFileNames; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.service.ProcessMainClass; import nu.marginalia.storage.FileStorageService; import nu.marginalia.model.crawlspec.CrawlSpecRecord; @@ -136,7 +137,12 @@ public class CrawlerMain extends ProcessMainClass { var instructions = crawler.fetchInstructions(); try { - crawler.run(instructions.specProvider, instructions.outputDir); + if (instructions.targetDomainName != null) { + crawler.runForSingleDomain(instructions.targetDomainName, instructions.outputDir); + } + else { + crawler.run(instructions.specProvider, instructions.outputDir); + } instructions.ok(); } catch (Exception ex) { logger.error("Crawler failed", ex); @@ -200,6 +206,26 @@ public class CrawlerMain extends ProcessMainClass { } } + public void runForSingleDomain(String targetDomainName, Path outputDir) throws Exception { + + heartbeat.start(); + + try 
(WorkLog workLog = new WorkLog(outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log")); + WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir); + AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName))) + ) { + var spec = new CrawlSpecRecord(targetDomainName, 1000, null); + var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog); + task.run(); + } + catch (Exception ex) { + logger.warn("Exception in crawler", ex); + } + finally { + heartbeat.shutDown(); + } + } + class CrawlTask implements SimpleBlockingThreadPool.Task { private final CrawlSpecRecord specification; @@ -216,7 +242,8 @@ public class CrawlerMain extends ProcessMainClass { AnchorTagsSource anchorTagsSource, Path outputDir, WarcArchiverIf warcArchiver, - WorkLog workLog) { + WorkLog workLog) + { this.specification = specification; this.anchorTagsSource = anchorTagsSource; this.outputDir = outputDir; @@ -303,11 +330,19 @@ public class CrawlerMain extends ProcessMainClass { private final MqMessage message; private final MqSingleShotInbox inbox; - CrawlRequest(CrawlSpecProvider specProvider, Path outputDir, MqMessage message, MqSingleShotInbox inbox) { + private final String targetDomainName; + + CrawlRequest(CrawlSpecProvider specProvider, + String targetDomainName, + Path outputDir, + MqMessage message, + MqSingleShotInbox inbox) + { this.message = message; this.inbox = inbox; this.specProvider = specProvider; this.outputDir = outputDir; + this.targetDomainName = targetDomainName; } @@ -325,6 +360,7 @@ public class CrawlerMain extends ProcessMainClass { var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, node, UUID.randomUUID()); logger.info("Waiting for instructions"); + var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName()); var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received")); @@ -350,6 +386,7 @@ public 
class CrawlerMain extends ProcessMainClass { return new CrawlRequest( specProvider, + request.targetDomainName, crawlData.asPath(), msg, inbox); diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java index 4b8337898..c385e52e2 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java @@ -24,6 +24,7 @@ import java.nio.file.Path; import java.sql.SQLException; import java.util.Arrays; import java.util.List; +import java.util.Objects; import java.util.Set; @Singleton @@ -88,6 +89,9 @@ public class ControlNodeActionsService { Spark.post("/nodes/:id/actions/recrawl", this::triggerAutoRecrawl, redirectControl.renderRedirectAcknowledgement("Recrawling", "..") ); + Spark.post("/nodes/:id/actions/recrawl-single-domain", this::triggerSingleDomainRecrawl, + redirectControl.renderRedirectAcknowledgement("Recrawling", "..") + ); Spark.post("/nodes/:id/actions/process", this::triggerProcess, redirectControl.renderRedirectAcknowledgement("Processing", "..") ); @@ -216,6 +220,21 @@ public class ControlNodeActionsService { return ""; } + private Object triggerSingleDomainRecrawl(Request request, Response response) throws SQLException { + int nodeId = Integer.parseInt(request.params("id")); + + var toCrawl = parseSourceFileStorageId(request.queryParams("source")); + var targetDomainName = Objects.requireNonNull(request.queryParams("targetDomainName")); + + crawlClient.triggerRecrawlSingleDomain( + nodeId, + toCrawl, + targetDomainName + ); + + return ""; + } + private Object triggerNewCrawl(Request request, Response response) throws SQLException { int nodeId = Integer.parseInt(request.params("id")); diff --git 
a/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb b/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb index 2be78e283..c06e2cd17 100644 --- a/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb +++ b/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb @@ -24,12 +24,20 @@

Summary

- + + +
DomainFileDomainFileCrawl
{{domain}} Download Parquet +
+ + + +
+

Contents

From 6401a513d7314a7717b113b4ac9ab8bdf1eae150 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Jul 2024 17:21:03 +0200 Subject: [PATCH 23/24] (crawl) Fix onsubmit confirm dialog for single-site recrawl --- .../control/node/node-storage-crawl-parquet-details.hdb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb b/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb index c06e2cd17..32ab39a41 100644 --- a/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb +++ b/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb @@ -31,10 +31,10 @@ Download Parquet -
+ - +
From ffd970036d4c109a41f5f01b276a5fd9251f5b85 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 15 Jul 2024 05:15:30 +0200 Subject: [PATCH 24/24] (term-frequency) Fix concurrency issues in SentenceExtractor and TermFrequencyExporter How'd This Ever Work? (tm) TermFrequencyExporter was using Math.clamp() incorrectly, and SentenceExtractor was synchronizing on its own instance when initializing shared static members, causing rare issues when spinning multiple SE:s up at once. --- .../java/nu/marginalia/extractor/TermFrequencyExporter.java | 2 +- .../java/nu/marginalia/language/sentence/SentenceExtractor.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index 18fb32617..3bcc9cf2a 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -54,7 +54,7 @@ public class TermFrequencyExporter implements ExporterIf { TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); AtomicInteger docCount = new AtomicInteger(); - SimpleBlockingThreadPool sjp = new SimpleBlockingThreadPool("exporter", Math.clamp(2, 16, Runtime.getRuntime().availableProcessors() / 2), 4); + SimpleBlockingThreadPool sjp = new SimpleBlockingThreadPool("exporter", Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 2, 16), 4); Path crawlerLogFile = inputDir.resolve("crawler.log"); for (var item : WorkLog.iterable(crawlerLogFile)) { diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java index bb1e3771f..8dd818a34 100644 --- 
a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -59,7 +59,7 @@ public class SentenceExtractor { logger.error("Could not initialize sentence detector", ex); } - synchronized (this) { + synchronized (SentenceExtractor.class) { if (ngramLexicon == null) { ngramLexicon = new NgramLexicon(models); }