Commit d6a37a8c authored by Douwe Maan's avatar Douwe Maan

Merge branch 'bugfix/outlook-language-parsing' into 'master'

Switch to Discourse's EmailReplyTrimmer for more robust reply parsing

See merge request !7473
parents 47b35dde 7218daaa
...@@ -332,7 +332,7 @@ gem 'octokit', '~> 4.3.0' ...@@ -332,7 +332,7 @@ gem 'octokit', '~> 4.3.0'
gem 'mail_room', '~> 0.9.0' gem 'mail_room', '~> 0.9.0'
gem 'email_reply_parser', '~> 0.5.8' gem 'email_reply_trimmer', '~> 0.1'
gem 'html2text' gem 'html2text'
gem 'ruby-prof', '~> 0.16.2' gem 'ruby-prof', '~> 0.16.2'
......
...@@ -167,7 +167,7 @@ GEM ...@@ -167,7 +167,7 @@ GEM
railties (>= 4.2) railties (>= 4.2)
dropzonejs-rails (0.7.2) dropzonejs-rails (0.7.2)
rails (> 3.1) rails (> 3.1)
email_reply_parser (0.5.8) email_reply_trimmer (0.1.6)
email_spec (1.6.0) email_spec (1.6.0)
launchy (~> 2.1) launchy (~> 2.1)
mail (~> 2.2) mail (~> 2.2)
...@@ -839,7 +839,7 @@ DEPENDENCIES ...@@ -839,7 +839,7 @@ DEPENDENCIES
diffy (~> 3.1.0) diffy (~> 3.1.0)
doorkeeper (~> 4.2.0) doorkeeper (~> 4.2.0)
dropzonejs-rails (~> 0.7.1) dropzonejs-rails (~> 0.7.1)
email_reply_parser (~> 0.5.8) email_reply_trimmer (~> 0.1)
email_spec (~> 1.6.0) email_spec (~> 1.6.0)
factory_girl_rails (~> 4.7.0) factory_girl_rails (~> 4.7.0)
ffaker (~> 2.0.0) ffaker (~> 2.0.0)
......
...@@ -13,9 +13,17 @@ module Gitlab ...@@ -13,9 +13,17 @@ module Gitlab
encoding = body.encoding encoding = body.encoding
body = discourse_email_trimmer(body) body = EmailReplyTrimmer.trim(body)
body = EmailReplyParser.parse_reply(body) return '' unless body
# not using /\s+$/ here because that deletes empty lines
body = body.gsub(/[ \t]$/, '')
# NOTE: We currently don't support empty quotes.
# EmailReplyTrimmer allows this as a special case,
# so we detect it manually here.
return "" if body.lines.all? { |l| l.strip.empty? || l.start_with?('>') }
body.force_encoding(encoding).encode("UTF-8") body.force_encoding(encoding).encode("UTF-8")
end end
...@@ -57,30 +65,6 @@ module Gitlab ...@@ -57,30 +65,6 @@ module Gitlab
rescue rescue
nil nil
end end
REPLYING_HEADER_LABELS = %w(From Sent To Subject Reply To Cc Bcc Date)
REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" })
def discourse_email_trimmer(body)
lines = body.scrub.lines.to_a
range_end = 0
lines.each_with_index do |l, idx|
# This one might be controversial but so many reply lines have years, times and end with a colon.
# Let's try it and see how well it works.
break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) ||
(l =~ /On \w+ \d+,? \d+,?.*wrote:/)
# Headers on subsequent lines
break if (0..2).all? { |off| lines[idx + off] =~ REPLYING_HEADER_REGEX }
# Headers on the same line
break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3
range_end = idx
end
lines[0..range_end].join.strip
end
end end
end end
end end
...@@ -88,8 +88,6 @@ describe Gitlab::Email::ReplyParser, lib: true do ...@@ -88,8 +88,6 @@ describe Gitlab::Email::ReplyParser, lib: true do
expect(test_parse_body(fixture_file("emails/inline_reply.eml"))). expect(test_parse_body(fixture_file("emails/inline_reply.eml"))).
to eq( to eq(
<<-BODY.strip_heredoc.chomp <<-BODY.strip_heredoc.chomp
On Wed, Oct 8, 2014 at 11:12 AM, techAPJ <info@unconfigured.discourse.org> wrote:
> techAPJ <https://meta.discourse.org/users/techapj> > techAPJ <https://meta.discourse.org/users/techapj>
> November 28 > November 28
> >
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment