27 changes: 27 additions & 0 deletions lib/gscraper/google/http_client.ex
@@ -0,0 +1,27 @@
defmodule Gscraper.Google.HttpClient do
@google_search_base_url "https://www.google.com"

def get(path, query_params \\ %{}, headers \\ []) do
client()
|> Tesla.get(path, query: Map.to_list(query_params), headers: headers)
|> handle_response()
end

defp client do
middleware = [
{Tesla.Middleware.BaseUrl, @google_search_base_url}
]

adapter = {Tesla.Adapter.Hackney, []}

Tesla.client(middleware, adapter)
end

defp handle_response({:ok, %{status: status, body: body}}) when status in 200..299,
do: {:ok, body}

defp handle_response({:ok, %{status: status, body: body}}),
do: {:error, %{status: status, body: body}}

defp handle_response({:error, reason}), do: {:error, reason}
end
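A quick sanity check of the client above, as a hypothetical iex session (the response body shown is illustrative, not a real fetch):

iex> Gscraper.Google.HttpClient.get("search", %{q: "elixir"}, [{"User-Agent", "test-agent"}])
{:ok, "<!doctype html>..."}
# Non-2xx statuses come back as {:error, %{status: status, body: body}};
# transport failures come back as {:error, reason}.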
3 changes: 2 additions & 1 deletion lib/gscraper/search/schemas/keyword.ex
@@ -2,13 +2,14 @@ defmodule Gscraper.Search.Schemas.Keyword do
use Ecto.Schema
import Ecto.Changeset

alias Gscraper.Account.Schemas.User
alias Gscraper.Search.Schemas.Report

schema "keywords" do
field :keyword, :string
field :status, Ecto.Enum, values: [:pending, :failed, :completed]

belongs_to :user, User
has_one :report, Report

timestamps()
end
30 changes: 30 additions & 0 deletions lib/gscraper/search/schemas/report.ex
@@ -0,0 +1,30 @@
defmodule Gscraper.Search.Schemas.Report do
use Ecto.Schema
import Ecto.Changeset

alias Gscraper.Search.Schemas.Keyword

# credo:disable-for-next-line Credo.Check.Readability.MaxLineLength
@fields ~w(ads_count top_ads_count top_ads_urls organic_result_count organic_urls links_count raw_html)a

schema "keywords" do
field :ads_count, :integer
field :top_ads_count, :integer
field :top_ads_urls, {:array, :string}
field :organic_result_count, :integer
field :organic_urls, {:array, :string}
field :links_count, :integer
field :raw_html, :string

belongs_to :keyword, Keyword

timestamps()
end

def create_changeset(report \\ %__MODULE__{}, attrs) do
report
|> cast(attrs, @fields)
|> validate_required(@fields)
|> assoc_constraint(:keyword)
end
end
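The Report schema needs a backing reports table, and this PR does not include the migration. A sketch of what it would look like (file name, column types, and the on_delete choice are assumptions):

defmodule Gscraper.Repo.Migrations.CreateReports do
  use Ecto.Migration

  def change do
    create table(:reports) do
      add :ads_count, :integer
      add :top_ads_count, :integer
      add :top_ads_urls, {:array, :string}
      add :organic_result_count, :integer
      add :organic_urls, {:array, :string}
      add :links_count, :integer
      # raw_html can exceed the default varchar limit, hence :text
      add :raw_html, :text
      add :keyword_id, references(:keywords, on_delete: :delete_all)

      timestamps()
    end

    create index(:reports, [:keyword_id])
  end
end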
36 changes: 36 additions & 0 deletions lib/gscraper/search/scraper.ex
@@ -0,0 +1,36 @@
defmodule Gscraper.Search.Scraper do
alias Gscraper.Google.HttpClient, as: GoogleClient
alias Gscraper.Search.Schemas.Keyword
alias Gscraper.Search.SearchResultParser

# credo:disable-for-lines:7 Credo.Check.Readability.MaxLineLength
@user_agents [
"Mozilla/5.0 (Windows NT 5.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/98.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15"
]

def scrape(keyword) do
with {:ok, raw_html} <- search_keyword(keyword),
{:ok, parsed_result} <- SearchResultParser.parse(raw_html) do
{:ok, parsed_result}
else
{:error, :failed_to_parse, reason} ->
{:error, :failed_to_parse, "Failed to parse search result: #{inspect(reason)}"}

{:error, reason} ->
{:error, :http_client_error, "Search page cannot be fetched: #{inspect(reason)}"}
end
end

defp search_keyword(%Keyword{keyword: keyword}) do
query_params = %{q: keyword}
headers = [{"User-Agent", random_user_agent()}]

GoogleClient.get("search", query_params, headers)
end

defp random_user_agent, do: Enum.random(@user_agents)
end
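For reference, the shapes scrape/1 can return, shown as a hypothetical session (all counts and URLs are illustrative):

iex> Gscraper.Search.Scraper.scrape(keyword)
{:ok,
 %{
   ads_count: 3,
   top_ads_count: 2,
   top_ads_urls: ["https://www.example-ad.com"],
   organic_result_count: 10,
   organic_urls: ["https://example.com"],
   links_count: 42,
   raw_html: "<!doctype html>..."
 }}
# or {:error, :http_client_error, "Search page cannot be fetched: ..."}
# or {:error, :failed_to_parse, "Failed to parse search result: ..."}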
67 changes: 67 additions & 0 deletions lib/gscraper/search/search_result_parser.ex
@@ -0,0 +1,67 @@
defmodule Gscraper.Search.SearchResultParser do
@top_ads_selector "#tads .uEierd"
@top_ads_link_selector "#tads a"
@sidebar_ads_selector ".Yi78Pd .pla-unit"
@organic_link_selector "#search .g"
@organic_result_selector "#search .g a"
@href_attribute "a[href]"

def parse(raw_html) do
case Floki.parse_document(raw_html) do
{:ok, parsed_html} ->
results = %{
ads_count: get_ads_count(parsed_html),
top_ads_count: get_top_ads_count(parsed_html),
top_ads_urls: get_top_ads_urls(parsed_html),
organic_result_count: get_organic_result_count(parsed_html),
organic_urls: get_organic_urls(parsed_html),
links_count: get_links_count(parsed_html),
raw_html: raw_html
}

{:ok, results}

{:error, reason} ->
{:error, :failed_to_parse, reason}
end
end

defp get_ads_count(parsed_html),
do: get_sidebar_ads_count(parsed_html) + get_top_ads_count(parsed_html)

defp get_sidebar_ads_count(parsed_html) do
parsed_html
|> Floki.find(@sidebar_ads_selector)
|> Enum.count()
end

defp get_top_ads_count(parsed_html) do
parsed_html
|> Floki.find(@top_ads_selector)
|> Enum.count()
end

defp get_top_ads_urls(parsed_html) do
parsed_html
|> Floki.find(@top_ads_link_selector)
|> Floki.attribute(@href_attribute)
end

defp get_organic_result_count(parsed_html) do
parsed_html
|> Floki.find(@organic_result_selector)
|> Enum.count()
end

defp get_organic_urls(parsed_html) do
parsed_html
|> Floki.find(@organic_link_selector)
|> Floki.attribute(@href_attribute)
end

# Assumption: links_count means every anchor on the page; the original
# body counted "#search .g" and duplicated get_organic_result_count/1.
defp get_links_count(parsed_html) do
parsed_html
|> Floki.find(@link_selector)
|> Enum.count()
end
end
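A minimal sketch of the parser on a tiny hand-written document (not a real results page), assuming the selectors above:

iex> html = ~s(<div id="search"><div class="g"><a href="https://example.com">result</a></div></div>)
iex> {:ok, result} = Gscraper.Search.SearchResultParser.parse(html)
iex> {result.organic_result_count, result.organic_urls}
{1, ["https://example.com"]}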
29 changes: 22 additions & 7 deletions lib/gscraper/search/searches.ex
@@ -2,7 +2,10 @@ defmodule Gscraper.Search.Searches do
alias Gscraper.Account.Schemas.User
alias Gscraper.Repo
alias Gscraper.Search.Queries.KeywordQuery
alias Gscraper.Search.Schemas.Keyword
alias Gscraper.Search.Schemas.{Keyword, Report}
alias Gscraper.Search.ScraperWorker

def find_keyword_by_id(id), do: Repo.get(Keyword, id)

def list_keywords_by_user(user) do
user
@@ -18,15 +21,27 @@

def process_keyword_list(keyword_list, %User{id: user_id}) do
Enum.each(keyword_list, fn keyword ->
create_params = %{
user_id: user_id,
keyword: keyword
}
create_params = %{user_id: user_id, keyword: keyword}

Ecto.Multi.new()
|> Ecto.Multi.run(:keyword, fn _, _ -> create_keyword(create_params) end)
# Todo: enqueue a new search job for the given keyword
|> Ecto.Multi.run(:create_keyword, fn _, _ -> create_keyword(create_params) end)
|> Ecto.Multi.run(:enqueue_search_job, fn _, %{create_keyword: keyword} ->
enqueue_search_job(keyword)
end)
|> Repo.transaction()
end)
end

def create_report(%Keyword{} = keyword, attrs \\ %{}) do
keyword
|> Ecto.build_assoc(:report)
|> Report.create_changeset(attrs)
|> Repo.insert()
end

defp enqueue_search_job(%Keyword{id: keyword_id}) do
%{keyword_id: keyword_id}
|> ScraperWorker.new()
|> Oban.insert()
end
end
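A usage sketch of the updated flow (the user lookup is hypothetical; adjust to however users are fetched in the app):

iex> user = Gscraper.Repo.get!(Gscraper.Account.Schemas.User, 1)
iex> Gscraper.Search.Searches.process_keyword_list(["elixir", "phoenix"], user)
:ok
# Each keyword row and its ScraperWorker job are committed in one transaction,
# so a failed enqueue rolls back the keyword insert.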
24 changes: 24 additions & 0 deletions lib/gscraper/search/worker/scraper_worker.ex
@@ -0,0 +1,24 @@
defmodule Gscraper.Search.ScraperWorker do
use Oban.Worker, max_attempts: 10, unique: [period: 30]

alias Gscraper.Search.Schemas.Keyword
alias Gscraper.Search.{Scraper, Searches}

@impl Oban.Worker
def perform(%Oban.Job{args: %{"keyword_id" => keyword_id}}) do
with %Keyword{} = keyword <- Searches.find_keyword_by_id(keyword_id),
{:ok, parsed_result} <- Scraper.scrape(keyword),
{:ok, _} <- Searches.create_report(keyword, parsed_result) do
:ok
else
nil ->
{:error, "Keyword not found for ID: #{keyword_id}"}

{:error, :failed_to_parse, reason} ->
{:error, reason}

{:error, :http_client_error, reason} ->
{:error, reason}

# Searches.create_report/2 can fail with an invalid changeset
{:error, %Ecto.Changeset{} = changeset} ->
{:error, changeset}
end
end
end
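The worker assumes Oban is configured for the app; a minimal config sketch (queue name and concurrency are assumptions, not part of this PR):

# config/config.exs
config :gscraper, Oban,
  repo: Gscraper.Repo,
  queues: [default: 10]

Oban.Worker defaults to the :default queue, so jobs enqueued by Searches.enqueue_search_job/1 run there.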
7 changes: 5 additions & 2 deletions mix.exs
@@ -41,13 +41,14 @@ defmodule Gscraper.MixProject do
defp deps do
[
{:argon2_elixir, "~> 2.0"},
{:wallaby, "~> 0.28.0", [only: :test, runtime: false]},
{:wallaby, "~> 0.29.1", [only: :test, runtime: false]},
{:sobelow, "~> 0.11.1", [only: [:dev, :test], runtime: false]},
{:oban, "~> 2.6.1"},
{:mimic, "~> 1.4.0", [only: :test]},
{:ex_machina, "~> 2.7.0", [only: :test]},
{:excoveralls, "~> 0.14.0", [only: :test]},
{:faker, "~> 0.16", only: :test},
{:floki, "~> 0.33.0"},
{:dialyxir, "~> 1.1.0", [only: [:dev], runtime: false]},
{:credo, "~> 1.5.6", [only: [:dev, :test], runtime: false]},
{:phoenix, "~> 1.5.12"},
@@ -63,7 +64,9 @@
{:jason, "~> 1.0"},
{:plug_cowboy, "~> 2.5"},
{:guardian, "~> 2.0"},
{:nimble_csv, "~> 1.1"}
{:nimble_csv, "~> 1.1"},
{:tesla, "~> 1.4"},
{:hackney, "~> 1.17"}
]
end
