Commit be4fe7aa authored by Thong Kuah's avatar Thong Kuah

Merge branch 'regression_test_project_homepage_robots_txt' into 'master'

Test that project homepage is accessible by crawlers

See merge request gitlab-org/gitlab!42180
parents 882d8bc6 dbf1b5af
require 'gitlab/testing/request_blocker_middleware'
require 'gitlab/testing/robots_blocker_middleware'
require 'gitlab/testing/request_inspector_middleware'
require 'gitlab/testing/clear_process_memory_cache_middleware'
require 'gitlab/utils'
......@@ -6,6 +7,7 @@ require 'gitlab/utils'
Rails.application.configure do
# Make sure the middleware is inserted first in middleware chain
config.middleware.insert_before(ActionDispatch::Static, Gitlab::Testing::RequestBlockerMiddleware)
config.middleware.insert_before(ActionDispatch::Static, Gitlab::Testing::RobotsBlockerMiddleware)
config.middleware.insert_before(ActionDispatch::Static, Gitlab::Testing::RequestInspectorMiddleware)
config.middleware.insert_before(ActionDispatch::Static, Gitlab::Testing::ClearProcessMemoryCacheMiddleware)
......
# frozen_string_literal: true

module Gitlab
  # Answers "is this request path disallowed for crawlers?" by lazily
  # loading and memoizing a parsed copy of public/robots.txt.
  module RobotsTxt
    class << self
      # True when +path+ matches a `Disallow:` rule from robots.txt.
      def disallowed?(path)
        parsed_robots_txt.disallowed?(path)
      end

      # Memoized Parser built from the on-disk robots.txt content.
      # NOTE(review): memoized for the process lifetime — edits to the
      # file after first use are not picked up.
      def parsed_robots_txt
        @parsed_robots_txt ||= Parser.new(robots_txt)
      end

      # Raw text of public/robots.txt under the Rails root.
      def robots_txt
        File.read(Rails.root.join('public', 'robots.txt'))
      end
    end
  end
end
# frozen_string_literal: true

module Gitlab
  module RobotsTxt
    # Minimal robots.txt parser: extracts `Disallow:` lines and answers
    # whether a given request path matches any of them.
    class Parser
      # Array of Regexp rules, one per `Disallow:` line, in file order.
      attr_reader :disallow_rules

      # content - raw robots.txt text to parse.
      def initialize(content)
        @raw_content = content
        @disallow_rules = parse_raw_content!
      end

      # Returns true when +path+ matches any disallow rule.
      # Uses Regexp#match? to avoid allocating MatchData per rule.
      def disallowed?(path)
        disallow_rules.any? { |rule| rule.match?(path) }
      end

      private

      # This parser is very basic as it only knows about `Disallow:` lines,
      # and simply ignores all other lines.
      #
      # Order of precedence, `Allow:`, etc are ignored for now.
      def parse_raw_content!
        @raw_content.each_line.map do |line|
          if line.start_with?('Disallow:')
            value = line.sub('Disallow:', '').strip
            # `*` is the only supported wildcard; everything else is literal.
            value = Regexp.escape(value).gsub('\*', '.*')
            # Anchor with \A (start of string) rather than ^ (start of line):
            # ^ matches after any embedded newline, so a crafted path such as
            # "/ok\n/search" could false-match (or bypass) a rule.
            Regexp.new("\\A#{value}")
          else
            nil
          end
        end.compact
      end
    end
  end
end
# frozen_string_literal: true

# rubocop:disable Style/ClassVars
module Gitlab
  module Testing
    # Test-only Rack middleware.
    #
    # Block requests according to robots.txt.
    # Any new requests disallowed by robots.txt will return an HTTP 503 status.
    class RobotsBlockerMiddleware
      # Process-wide toggle shared by every instance of this middleware.
      @@block_requests = Concurrent::AtomicBoolean.new(false)

      # Start rejecting robots.txt-disallowed requests.
      def self.block_requests!
        @@block_requests.value = true
      end

      # Allows the server to accept requests again.
      def self.allow_requests!
        @@block_requests.value = false
      end

      def initialize(app)
        @app = app
      end

      # Rack entry point: pass the request through unless blocking is on
      # AND the path matches a robots.txt `Disallow:` rule.
      def call(env)
        req = Rack::Request.new(env)

        return @app.call(env) unless block_requests? && Gitlab::RobotsTxt.disallowed?(req.path_info)

        block_request(env)
      end

      private

      def block_requests?
        @@block_requests.true?
      end

      # 503 with empty headers and body.
      def block_request(_env)
        [503, {}, []]
      end
    end
  end
end
......@@ -14,4 +14,25 @@ RSpec.describe 'Projects > Show > User sees README' do
expect(page).to have_content 'testme'
end
end
# Enables RobotsBlockerMiddleware around each example so any request the
# page makes to a robots.txt-disallowed path is rejected with a 503.
context 'obeying robots.txt' do
  before do
    Gitlab::Testing::RobotsBlockerMiddleware.block_requests!
  end

  # Always switch blocking back off so later examples are unaffected.
  after do
    Gitlab::Testing::RobotsBlockerMiddleware.allow_requests!
  end

  # For example, see this regression we had in
  # https://gitlab.com/gitlab-org/gitlab/-/merge_requests/39520
  it 'does not block the requests necessary to load the project README', :js do
    visit project_path(project)
    wait_for_requests

    page.within('.readme-holder') do
      expect(page).to have_content 'testme'
    end
  end
end
end
# frozen_string_literal: true
require 'fast_spec_helper'
require 'rspec-parameterized'
# Unit coverage for the minimal robots.txt parser: `Disallow:` rules are
# start-anchored prefix matches, with `*` as the only supported wildcard.
RSpec.describe Gitlab::RobotsTxt::Parser do
  describe '#disallowed?' do
    subject { described_class.new(content).disallowed?(path) }

    context 'a simple robots.txt file' do
      using RSpec::Parameterized::TableSyntax

      let(:content) do
        <<~TXT
          User-Agent: *
          Disallow: /autocomplete/users
          Disallow: /search
          Disallow: /api
        TXT
      end

      # Rules are prefix matches, so sub-paths of a disallowed path are
      # disallowed too; unrelated paths stay allowed.
      where(:path, :result) do
        '/autocomplete/users' | true
        '/autocomplete/users/a.html' | true
        '/search' | true
        '/search.html' | true
        '/api' | true
        # NOTE(review): 'grapql' is likely a typo for 'graphql'; harmless here
        # because '/api' is a prefix match — confirm before fixing.
        '/api/grapql' | true
        '/api/index.html' | true
        '/projects' | false
      end

      with_them do
        it { is_expected.to eq(result), "#{path} expected to be #{result}" }
      end
    end

    context 'robots.txt file with wildcard' do
      using RSpec::Parameterized::TableSyntax

      # Mirrors the wildcard rules shipped in GitLab's public/robots.txt
      # (`*` matches any run of characters within the path).
      let(:content) do
        <<~TXT
          User-Agent: *
          Disallow: /search

          User-Agent: *
          Disallow: /*/*.git
          Disallow: /*/archive/
          Disallow: /*/repository/archive*
        TXT
      end

      where(:path, :result) do
        '/search' | true
        '/namespace/project.git' | true
        '/project/archive/' | true
        '/project/archive/file.gz' | true
        '/project/repository/archive' | true
        '/project/repository/archive.gz' | true
        '/project/repository/archive/file.gz' | true
        '/projects' | false
        '/git' | false
        '/projects/git' | false
      end

      with_them do
        it { is_expected.to eq(result), "#{path} expected to be #{result}" }
      end
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment