Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions docker/mediaproc/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,17 @@ RUN cd /tmp \

COPY native/philomena /tmp/philomena
COPY docker/mediaproc/safe-rsvg-convert /usr/bin/safe-rsvg-convert
ADD https://github.com/liamwhite/philomena-ris-inference-toolkit/releases/download/v1.0/dinov2-with-registers-base.pt /usr/share/dinov2-with-registers-base.pt

RUN cd /tmp/philomena \
&& cargo build --release -p mediaproc_server \
&& cp target/release/mediaproc_server /usr/bin/mediaproc_server
&& cp target/release/mediaproc_server /usr/bin/mediaproc_server \
&& find target/release/build -regextype posix-extended -regex '^.*\.so(\.[0-9]+)*$' -exec cp '{}' /usr/lib/ ';' \
&& chmod 0644 /usr/share/dinov2-with-registers-base.pt

# Set up unprivileged user account
RUN useradd -ms /bin/bash mediaproc
USER mediaproc
WORKDIR /home/mediaproc
ENV RUST_LOG=trace
CMD ["/usr/bin/mediaproc_server", "0.0.0.0:1500"]
CMD ["/usr/bin/mediaproc_server", "0.0.0.0:1500", "/usr/share/dinov2-with-registers-base.pt"]
91 changes: 85 additions & 6 deletions lib/philomena/duplicate_reports.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ defmodule Philomena.DuplicateReports do
alias Ecto.Multi
alias Philomena.Repo

alias PhilomenaMedia.Features
alias PhilomenaQuery.Search
alias Philomena.DuplicateReports.DuplicateReport
alias Philomena.DuplicateReports.SearchQuery
alias Philomena.DuplicateReports.Uploader
Expand All @@ -32,7 +34,7 @@ defmodule Philomena.DuplicateReports do
source = Repo.preload(source, :intensity)

{source.intensity, source.image_aspect_ratio}
|> find_duplicates(dist: 0.2)
|> find_duplicates_by_intensities(dist: 0.2)
|> where([i, _it], i.id != ^source.id)
|> Repo.all()
|> Enum.map(fn target ->
Expand All @@ -42,6 +44,76 @@ defmodule Philomena.DuplicateReports do
end)
end

@doc """
Searches `Image` documents with a nested k-NN query on the `vectors` path,
using the vector carried by the given `%Features{}` struct.

`filter` is sent as the search `post_filter` rather than as part of the
k-NN query itself (see the TODO in the body for why).

## Options

  * `:min_score` - value sent as the search `min_score`. Defaults to `0`.
  * `:limit` - page size requested from the search. Defaults to `25`.

The returned page has `total_entries` capped at the limit and `total_pages`
capped at `1`, so callers only ever see a single page of results.
"""
def find_duplicates_by_features(%Features{features: vector}, filter, opts \\ []) do
  score_floor = Keyword.get(opts, :min_score, 0)
  page_size = Keyword.get(opts, :limit, 25)

  # TODO: many issues with efficient filtering using k-NN plugin,
  # use post_filter to work around for the time being
  #
  # https://github.com/opensearch-project/k-NN/issues/2222
  # https://github.com/opensearch-project/k-NN/issues/2339
  # https://github.com/opensearch-project/k-NN/issues/2347

  knn_clause = %{"vectors.f": %{vector: vector, k: 100}}

  body = %{
    query: %{nested: %{path: "vectors", query: %{knn: knn_clause}}},
    post_filter: filter,
    min_score: score_floor
  }

  preloads = preload(Image, [:user, :sources, tags: :aliases])

  page =
    Image
    |> Search.search_definition(body, %{page_size: page_size})
    |> Search.search_records(preloads)

  # Clamp the pagination metadata to the single page that was requested.
  %{
    page
    | total_entries: min(page.total_entries, page_size),
      total_pages: min(page.total_pages, 1)
  }
end

@doc """
Executes the feature-based reverse image search query from parameters.

`attrs` are cast into a `%SearchQuery{}` and the upload is analyzed. When
the resulting changeset is valid, features are generated for the uploaded
image and passed, together with `filter` and the query's `limit`, to
`find_duplicates_by_features/3`.

Any result other than `{:ok, search_query}` from applying the changeset is
returned unchanged.

## Examples

    iex> execute_search_query_by_features(filter, %{"image" => ...})
    {:ok, images}

    iex> execute_search_query_by_features(filter, %{"image" => ...})
    {:error, %Ecto.Changeset{}}

"""
def execute_search_query_by_features(filter, attrs \\ %{}) do
  applied =
    %SearchQuery{}
    |> SearchQuery.changeset(attrs)
    |> Uploader.analyze_upload(attrs)
    |> Ecto.Changeset.apply_action(:create)

  with {:ok, search_query} <- applied do
    features = generate_features(search_query)

    {:ok, find_duplicates_by_features(features, filter, limit: search_query.limit)}
  end
end

@doc """
Query for potential duplicate images based on intensity values and aspect ratio.

Expand All @@ -52,14 +124,14 @@ defmodule Philomena.DuplicateReports do

## Examples

iex> find_duplicates({%{nw: 0.5, ne: 0.5, sw: 0.5, se: 0.5}, 1.0})
iex> find_duplicates_by_intensities({%{nw: 0.5, ne: 0.5, sw: 0.5, se: 0.5}, 1.0})
#Ecto.Query<...>

iex> find_duplicates({intensities, ratio}, dist: 0.3, limit: 20)
iex> find_duplicates_by_intensities({intensities, ratio}, dist: 0.3, limit: 20)
#Ecto.Query<...>

"""
def find_duplicates({intensities, aspect_ratio}, opts \\ []) do
def find_duplicates_by_intensities({intensities, aspect_ratio}, opts \\ []) do
aspect_dist = Keyword.get(opts, :aspect_dist, 0.05)
limit = Keyword.get(opts, :limit, 10)
dist = Keyword.get(opts, :dist, 0.25)
Expand Down Expand Up @@ -100,7 +172,7 @@ defmodule Philomena.DuplicateReports do
{:error, %Ecto.Changeset{}}

"""
def execute_search_query(attrs \\ %{}) do
def execute_search_query_by_intensities(attrs \\ %{}) do
%SearchQuery{}
|> SearchQuery.changeset(attrs)
|> Uploader.analyze_upload(attrs)
Expand All @@ -114,7 +186,7 @@ defmodule Philomena.DuplicateReports do

images =
{intensities, aspect}
|> find_duplicates(dist: dist, aspect_dist: dist, limit: limit)
|> find_duplicates_by_intensities(dist: dist, aspect_dist: dist, limit: limit)
|> preload([:user, :intensity, [:sources, tags: :aliases]])
|> Repo.paginate(page_size: 50)

Expand All @@ -132,6 +204,13 @@ defmodule Philomena.DuplicateReports do
PhilomenaMedia.Processors.intensities(analysis, file)
end

# Converts the search query into an analysis and runs the features
# processor on it together with the query's uploaded image.
defp generate_features(search_query) do
  search_query
  |> SearchQuery.to_analysis()
  |> PhilomenaMedia.Processors.features(search_query.uploaded_image)
end

@doc """
Returns an `%Ecto.Changeset{}` for tracking search query changes.

Expand Down
91 changes: 91 additions & 0 deletions lib/philomena/image_vectors.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
defmodule Philomena.ImageVectors do
  @moduledoc """
  The ImageVectors context.
  """

  import Ecto.Query, warn: false
  alias Philomena.Repo

  alias Philomena.ImageVectors.ImageVector

  @doc """
  Gets a single image_vector by its primary key.

  Raises `Ecto.NoResultsError` if the image vector does not exist.

  ## Examples

      iex> get_image_vector!(123)
      %ImageVector{}

      iex> get_image_vector!(456)
      ** (Ecto.NoResultsError)

  """
  def get_image_vector!(id) do
    Repo.get!(ImageVector, id)
  end

  @doc """
  Creates an image_vector belonging to `image`.

  `attrs` is converted with `Map.from_struct/1` before being given to the
  changeset, and defaults to an empty `%PhilomenaMedia.Features{}`.

  ## Examples

      iex> create_image_vector(image, %PhilomenaMedia.Features{...})
      {:ok, %ImageVector{}}

      iex> create_image_vector(image, %PhilomenaMedia.Features{...})
      {:error, %Ecto.Changeset{}}

  """
  def create_image_vector(image, attrs \\ %PhilomenaMedia.Features{}) do
    vector = %ImageVector{image_id: image.id}
    changeset = ImageVector.changeset(vector, Map.from_struct(attrs))

    Repo.insert(changeset)
  end

  @doc """
  Updates an image_vector.

  ## Examples

      iex> update_image_vector(image_vector, %{field: new_value})
      {:ok, %ImageVector{}}

      iex> update_image_vector(image_vector, %{field: bad_value})
      {:error, %Ecto.Changeset{}}

  """
  def update_image_vector(%ImageVector{} = image_vector, attrs) do
    changeset = ImageVector.changeset(image_vector, attrs)

    Repo.update(changeset)
  end

  @doc """
  Deletes an image_vector.

  ## Examples

      iex> delete_image_vector(image_vector)
      {:ok, %ImageVector{}}

      iex> delete_image_vector(image_vector)
      {:error, %Ecto.Changeset{}}

  """
  def delete_image_vector(%ImageVector{} = image_vector), do: Repo.delete(image_vector)

  @doc """
  Returns an `%Ecto.Changeset{}` for tracking image_vector changes.

  ## Examples

      iex> change_image_vector(image_vector)
      %Ecto.Changeset{data: %ImageVector{}}

  """
  def change_image_vector(%ImageVector{} = image_vector, attrs \\ %{}),
    do: ImageVector.changeset(image_vector, attrs)
end
88 changes: 88 additions & 0 deletions lib/philomena/image_vectors/batch_processor.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
defmodule Philomena.ImageVectors.BatchProcessor do
  @moduledoc """
  Batch processing interface for Philomena. See the module documentation
  in `m:Philomena.ImageVectors.Importer` for more information about how to
  use the functions in this module during maintenance.
  """

  alias Philomena.Images
  alias Philomena.Images.Image
  alias Philomena.Images.Thumbnailer
  alias Philomena.ImageVectors.ImageVector
  alias Philomena.Maintenance
  alias Philomena.Repo

  alias PhilomenaMedia.Analyzers
  alias PhilomenaMedia.Processors
  alias PhilomenaQuery.Batch
  alias PhilomenaQuery.Search

  import Ecto.Query

  @doc """
  Generates and stores feature vectors for every image that has no
  `ImageVector` row at all, then reindexes the processed images.

  `type` is stored on each inserted vector and used in the progress label.
  `opts` is forwarded to the batching and reindexing calls; `:max_concurrency`
  (default `4`) bounds how many batches are processed at once.

  When no image is missing a vector there is nothing to do and `:ok` is
  returned.
  """
  # NOTE(review): the "missing" check ignores `type`, so an image that only
  # has a vector of a different type is skipped. Confirm this is intended.
  @spec all_missing(String.t(), Keyword.t()) :: :ok
  def all_missing(type \\ "full", opts \\ []) do
    Image
    |> from(as: :image)
    |> where(not exists(where(ImageVector, [iv], iv.image_id == parent_as(:image).id)))
    |> by_image_query(type, opts)
  end

  # Processes all images matched by `query` in concurrent batches and logs
  # progress between the smallest and largest matching image id.
  @spec by_image_query(Ecto.Query.t(), String.t(), Keyword.t()) :: :ok
  defp by_image_query(query, type, opts) do
    max_concurrency = Keyword.get(opts, :max_concurrency, 4)

    # `Repo.one/1` returns nil when the query matches no rows; matching on
    # the struct avoids calling `.id` on nil in that case.
    with %Image{id: min_id} <- Repo.one(limit(order_by(query, asc: :id), 1)),
         %Image{id: max_id} <- Repo.one(limit(order_by(query, desc: :id), 1)) do
      query
      |> Batch.query_batches(opts)
      |> Task.async_stream(
        fn batch -> process_query(batch, type, opts) end,
        timeout: :infinity,
        max_concurrency: max_concurrency
      )
      |> Maintenance.log_progress("BatchProcessor/#{type}", min_id, max_id)
    else
      # Nothing matched the query, so there is no work to do.
      nil -> :ok
    end
  end

  # Processes one batch: computes features for each image, bulk-inserts the
  # resulting rows, reindexes the batch, and returns the highest image id
  # in the batch.
  @spec process_query(Ecto.Query.t(), String.t(), Keyword.t()) :: integer()
  defp process_query(query, type, batch_opts) do
    images = Repo.all(query)

    # NOTE(review): `Enum.max_by/2` raises on an empty list; this assumes
    # every batch query yields at least one row.
    last_id = Enum.max_by(images, & &1.id).id

    values =
      Enum.flat_map(images, fn image ->
        try do
          [process_image(image, type)]
        rescue
          ex ->
            # A failure on one image is reported and that image is skipped,
            # so the rest of the batch is still inserted.
            IO.puts("While processing #{image.id}: #{inspect(ex)}")
            IO.puts(Exception.format_stacktrace(__STACKTRACE__))
            []
        end
      end)

    {_count, nil} = Repo.insert_all(ImageVector, values, on_conflict: :nothing)

    :ok =
      query
      |> preload(^Images.indexing_preloads())
      |> Search.reindex(Image, batch_opts)

    last_id
  end

  # Downloads the image file, analyzes it, and builds the row map to insert
  # into `image_vectors`. Raises if the analysis does not return `{:ok, _}`.
  @spec process_image(%Image{}, String.t()) :: map()
  defp process_image(image = %Image{}, type) do
    file = Thumbnailer.download_image_file(image)

    {:ok, analysis} = Analyzers.analyze_path(file)
    features = Processors.features(analysis, file)

    %{
      image_id: image.id,
      type: type,
      features: features.features
    }
  end
end
19 changes: 19 additions & 0 deletions lib/philomena/image_vectors/image_vector.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
defmodule Philomena.ImageVectors.ImageVector do
  @moduledoc """
  Schema for the `image_vectors` table: a typed list of float features
  belonging to an image.
  """

  use Ecto.Schema
  import Ecto.Changeset

  alias Philomena.Images.Image

  # Fields that are both castable and required.
  @fields [:type, :features]

  schema "image_vectors" do
    belongs_to :image, Image
    field :type, :string
    field :features, {:array, :float}
  end

  @doc false
  def changeset(image_vector, attrs) do
    casted = cast(image_vector, attrs, @fields)

    validate_required(casted, @fields)
  end
end
Loading