Skip to content

Commit db58ec8

Browse files
committed
try to match pending messages by content
as gmail api changes the message id
1 parent d87a5d3 commit db58ec8

2 files changed

Lines changed: 163 additions & 0 deletions

File tree

app/services/email_ingestor.rb

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,20 @@ def ingest_raw(raw_message, mailing_list:, fallback_threading: false, trust_date
2020
return existing_message
2121
end
2222

23+
pending_echo = find_pending_echo(
24+
reply_to_message_id: reply_to_message_id,
25+
from_addresses: extract_from_emails(m),
26+
echo_body: body,
27+
echo_sent_at: sent_at
28+
)
29+
if pending_echo
30+
pending_echo.update_columns(message_id: message_id) if pending_echo.message_id != message_id
31+
update_existing_message(pending_echo, body: body, sent_at: sent_at,
32+
reply_to_message_id: reply_to_message_id, update_existing: update_existing)
33+
associate_mailing_list(pending_echo, mailing_list)
34+
return pending_echo
35+
end
36+
2337
import_log = ""
2438

2539
from = build_from_aliases(m, sent_at)
@@ -263,6 +277,53 @@ def clean_reference(ref)
263277
MessageIdNormalizer.normalize(ref)
264278
end
265279

280+
# Gmail rewrites Message-IDs on send, so the echo from the list has a different
281+
# id than the pending row we created. Match heuristically on parent reference,
282+
# sender, body fragment, and time window.
283+
PENDING_ECHO_WINDOW = 24.hours
284+
PENDING_ECHO_MIN_BODY = 20
285+
286+
def find_pending_echo(reply_to_message_id:, from_addresses:, echo_body:, echo_sent_at:)
287+
return nil if from_addresses.empty?
288+
289+
scope = Message.pending
290+
scope = scope.where(reply_to_message_id: reply_to_message_id) if reply_to_message_id.present?
291+
292+
reference_time = echo_sent_at || Time.current
293+
scope = scope.where(sent_at: (reference_time - PENDING_ECHO_WINDOW)..(reference_time + PENDING_ECHO_WINDOW))
294+
295+
sender_alias_ids = Alias.where("LOWER(email) IN (?)", from_addresses).pluck(:id)
296+
return nil if sender_alias_ids.empty?
297+
scope = scope.where(sender_id: sender_alias_ids)
298+
299+
candidates = scope.to_a
300+
return nil if candidates.empty?
301+
302+
echo_normalized = normalize_for_body_match(echo_body)
303+
return nil if echo_normalized.blank?
304+
305+
matches = candidates.select do |c|
306+
pending_normalized = normalize_for_body_match(c.body)
307+
next false if pending_normalized.length < PENDING_ECHO_MIN_BODY
308+
echo_normalized.include?(pending_normalized)
309+
end
310+
311+
matches.size == 1 ? matches.first : nil
312+
end
313+
314+
def normalize_for_body_match(text)
315+
text.to_s.gsub(/\s+/, " ").strip
316+
end
317+
318+
def extract_from_emails(mail)
319+
addrs = mail.from
320+
return [] unless addrs
321+
addrs = [ addrs ] if addrs.is_a?(String)
322+
addrs.compact.map { |a| a.to_s.downcase.strip }.reject(&:blank?)
323+
rescue NoMethodError
324+
[]
325+
end
326+
266327
def update_default_alias_for_person(alias_record)
267328
return unless alias_record&.person
268329
person = alias_record.person

spec/services/email_ingestor_spec.rb

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,108 @@
265265
end
266266
end
267267

268+
describe "pending echo with Gmail-rewritten Message-ID" do
269+
let(:user) { create(:user) }
270+
let(:sender) { create(:alias, user: user, email: "sender@example.com", name: "Sender") }
271+
let(:list) { create(:mailing_list) }
272+
let(:parent_id) { "parent-thread@example.com" }
273+
let(:topic) { create(:topic, creator: sender) }
274+
let!(:parent_msg) {
275+
Message.create!(
276+
topic: topic, sender: sender, sender_person_id: sender.person_id,
277+
subject: "Topic", body: "parent body",
278+
message_id: parent_id, state: Message::STATE_SENT,
279+
created_at: 1.hour.ago
280+
)
281+
}
282+
let(:pending_body) { "Hello, this is my reply body with enough length to match." }
283+
let!(:pending) {
284+
Message.create!(
285+
topic: topic, sender: sender, sender_person_id: sender.person_id,
286+
reply_to: parent_msg, reply_to_message_id: parent_id,
287+
subject: "Re: Topic", body: pending_body,
288+
message_id: "<placeholder-uuid@hackorum.dev>",
289+
state: Message::STATE_PENDING,
290+
sent_at: 30.seconds.ago,
291+
created_at: 30.seconds.ago
292+
)
293+
}
294+
295+
let(:gmail_echo_id) { "CABx_real_gmail_id@mail.gmail.com" }
296+
let(:echo_body) { "#{pending_body}\n\n-- \nList footer goes here" }
297+
let(:raw_echo) {
298+
<<~EML
299+
From: Sender <sender@example.com>
300+
To: pgsql-test@example.com
301+
Subject: [PGSQL] Re: Topic
302+
Message-ID: <#{gmail_echo_id}>
303+
In-Reply-To: <#{parent_id}>
304+
References: <#{parent_id}>
305+
Date: #{Time.current.rfc2822}
306+
307+
#{echo_body}
308+
EML
309+
}
310+
311+
it "flips pending to sent and rewrites message_id" do
312+
described_class.new.ingest_raw(raw_echo, mailing_list: list)
313+
pending.reload
314+
expect(pending.state).to eq(Message::STATE_SENT)
315+
expect(pending.message_id).to eq(gmail_echo_id)
316+
end
317+
318+
it "does not create a duplicate message" do
319+
expect {
320+
described_class.new.ingest_raw(raw_echo, mailing_list: list)
321+
}.not_to change { Message.count }
322+
end
323+
324+
it "associates the mailing list with the pending row" do
325+
described_class.new.ingest_raw(raw_echo, mailing_list: list)
326+
expect(pending.message_mailing_lists.where(mailing_list: list)).to exist
327+
end
328+
329+
it "leaves pending intact when echo body does not contain pending body" do
330+
raw = raw_echo.sub(echo_body, "completely unrelated content here that is long enough")
331+
expect {
332+
described_class.new.ingest_raw(raw, mailing_list: list)
333+
}.to change { Message.count }.by(1)
334+
expect(pending.reload.state).to eq(Message::STATE_PENDING)
335+
end
336+
337+
it "skips heuristic when sender email differs" do
338+
raw = raw_echo.sub("sender@example.com", "someone-else@example.com")
339+
expect {
340+
described_class.new.ingest_raw(raw, mailing_list: list)
341+
}.to change { Message.count }.by(1)
342+
expect(pending.reload.state).to eq(Message::STATE_PENDING)
343+
end
344+
345+
it "skips heuristic outside the time window" do
346+
pending.update_columns(sent_at: 3.days.ago, created_at: 3.days.ago)
347+
expect {
348+
described_class.new.ingest_raw(raw_echo, mailing_list: list)
349+
}.to change { Message.count }.by(1)
350+
expect(pending.reload.state).to eq(Message::STATE_PENDING)
351+
end
352+
353+
it "skips heuristic when two pending rows match ambiguously" do
354+
Message.create!(
355+
topic: topic, sender: sender, sender_person_id: sender.person_id,
356+
reply_to: parent_msg, reply_to_message_id: parent_id,
357+
subject: "Re: Topic", body: pending_body,
358+
message_id: "<another-placeholder@hackorum.dev>",
359+
state: Message::STATE_PENDING,
360+
sent_at: 20.seconds.ago,
361+
created_at: 20.seconds.ago
362+
)
363+
expect {
364+
described_class.new.ingest_raw(raw_echo, mailing_list: list)
365+
}.to change { Message.count }.by(1)
366+
expect(pending.reload.state).to eq(Message::STATE_PENDING)
367+
end
368+
end
369+
268370
describe "#fallback_thread_lookup scoped to list" do
269371
let(:ingestor) { described_class.new }
270372
let(:hackers_list) { create(:mailing_list, identifier: "pgsql-hackers", display_name: "hackers") }

0 commit comments

Comments
 (0)