Skip to content

Commit

Permalink
Feature: custom ALPN - better header generation and proxy handling (#20)
Browse files Browse the repository at this point in the history
* Feature: custom ALPN - better header generation and proxy handling

* chore: Bumped version. Updated changelog.
  • Loading branch information
petrpatek authored May 17, 2021
1 parent 482e068 commit 2badb20
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 10 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
1.0.4 / 2021/05/17
====================
- HTTP2 protocol resolving fix

1.0.3 / 2021/04/27
====================
- HTTP2 wrapper fix
Expand Down
2 changes: 1 addition & 1 deletion jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ module.exports = {
rootDir: path.join(__dirname, './'),
testMatch: ['**/?(*.)+(spec|test).[tj]s?(x)'],
setupFilesAfterEnv: ['jest-extended'],
timeout: 10e3,
testTimeout: 10e3,
};
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "got-scraping",
"version": "1.0.3",
"version": "1.0.4",
"description": "HTTP client made for scraping based on got.",
"main": "src/index.js",
"files": [
Expand Down
29 changes: 29 additions & 0 deletions src/handlers/alpn.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
const httpResolver = require('../http-resolver');

/**
 * Got handler that settles the `http2` option before the request continues
 * down the handler chain.
 *
 * When the caller asked for HTTP/2, the target URL is inspected:
 * - `https:` targets are probed through `httpResolver.resolveHttpVersion` and
 *   `options.http2` stays enabled only if the resolved protocol is `'h2'`;
 * - any other scheme switches `options.http2` off without probing.
 * When HTTP/2 was not requested, the options are passed on untouched.
 *
 * Note that `options` is mutated in place.
 *
 * @param {object} options - got options; `url` and `http2` are read, `http2` may be rewritten
 * @param {function} next - next handler in the chain, called with the same `options` object
 * @returns {import('got').GotReturn} whatever `next(options)` produces
 */
async function alpnHandler(options, next) {
    const { url: targetUrl, http2: wantsHttp2 } = options;

    // Nothing to negotiate when HTTP/2 was not requested.
    if (!wantsHttp2) {
        return next(options);
    }

    const target = new URL(targetUrl);

    // Only https: targets are probed; everything else is forced to HTTP/1.1.
    if (target.protocol !== 'https:') {
        options.http2 = false;
        return next(options);
    }

    const negotiated = await httpResolver.resolveHttpVersion(target);
    options.http2 = negotiated === 'h2';

    return next(options);
}

// Public surface of this module: the handler is exported by name so callers
// can destructure it from the required module.
module.exports = {
alpnHandler,
};
4 changes: 3 additions & 1 deletion src/handlers/browser-headers.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@ function createOptionsWithBeforeRequestHook(generatedHeaders, headerOverrides) {
hooks: {
beforeRequest: [
(gotOptions) => {
gotOptions.headers = mergeHeaders(generatedHeaders, headerOverrides);
const mergedOriginalHeaders = mergeHeaders(generatedHeaders, gotOptions.headers);

gotOptions.headers = mergeHeaders(mergedOriginalHeaders, headerOverrides);
},
],
},
Expand Down
2 changes: 1 addition & 1 deletion src/http-resolver.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class HttpResolver {
const result = await http2.auto.resolveProtocol({
host: hostname,
servername: hostname,
port,
port: port || 443,
ALPNProtocols: ['h2', 'http/1.1'],
rejectUnauthorized,
});
Expand Down
4 changes: 4 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const { optionsValidationHandler } = require('./handlers/options-validation');
const { customOptionsHandler } = require('./handlers/custom-options');
const { browserHeadersHandler } = require('./handlers/browser-headers');
const { proxyHandler } = require('./handlers/proxy');
const { alpnHandler } = require('./handlers/alpn');

const isResponseOk = (response) => {
const { statusCode } = response;
Expand Down Expand Up @@ -46,6 +47,9 @@ const gotScraping = got.extend(
handlers: [
optionsValidationHandler,
customOptionsHandler,
// ALPN negotiation is handled by got (http2-wrapper) by default.
// However, its caching causes problems when an HTTP proxy is used with an HTTPS target that speaks HTTP/1.1.
alpnHandler,
proxyHandler,
browserHeadersHandler,
],
Expand Down
43 changes: 43 additions & 0 deletions test/alpn.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
const { alpnHandler } = require('../src/handlers/alpn');
const httpResolver = require('../src/http-resolver');

// Unit tests for alpnHandler: the protocol resolver is stubbed so no network
// traffic happens, and only the resulting `options.http2` flag is checked.
describe('ALPN', () => {
    let next;
    let options;

    beforeEach(() => {
        // Minimal got-like options object; each test fills in `url` and `http2`.
        options = {
            context: {},
            https: {},
        };
        next = () => { };
    });

    afterEach(() => {
        // restoreAllMocks (rather than clearAllMocks) also removes the spies and
        // their stubbed return values from the shared httpResolver object, so a
        // stub installed in one test cannot leak into the next one.
        jest.restoreAllMocks();
    });

    test('should start alpn only if http2 and https', async () => {
        options.url = 'https://test.com';
        options.http2 = true;
        jest.spyOn(httpResolver, 'resolveHttpVersion').mockResolvedValue('h2');

        await alpnHandler(options, next);
        expect(options.http2).toBe(true);

        // Same target, but the server now only offers HTTP/1.1.
        jest.spyOn(httpResolver, 'resolveHttpVersion').mockResolvedValue('http/1.1');

        await alpnHandler(options, next);
        expect(options.http2).toBe(false);
    });

    test('should skip alpn and assume http/1.1 if not https', async () => {
        options.url = 'http://test.com';
        options.http2 = true;
        jest.spyOn(httpResolver, 'resolveHttpVersion');

        await alpnHandler(options, next);
        expect(httpResolver.resolveHttpVersion).toBeCalledTimes(0);
        expect(options.http2).toBe(false);
    });
});
29 changes: 25 additions & 4 deletions test/main.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ describe('GotScraping', () => {
});

test('should use all handlers', async () => {
expect(gotScraping.defaults.handlers).toHaveLength(4);
expect(gotScraping.defaults.handlers).toHaveLength(5);
});

test('should allow passing custom properties', async () => {
Expand All @@ -41,7 +41,7 @@ describe('GotScraping', () => {
expect(options.context.headerGeneratorOptions).toMatchObject(requestOptions.headerGeneratorOptions);
});

test('should allow overrding generated options using handlers', async () => {
test('should allow overriding generated options using handlers', async () => {
const requestOptions = {
url: `http://localhost:${port}/html`,
};
Expand Down Expand Up @@ -75,9 +75,9 @@ describe('GotScraping', () => {

expect(response.statusCode).toBe(200);
expect(response.request.options).toMatchObject({
http2: true,
http2: false,
headers: {
'user-agent': 'test',
'User-Agent': 'test',
},
});
});
Expand Down Expand Up @@ -128,6 +128,27 @@ describe('GotScraping', () => {
expect(response.statusCode).toBe(200);
expect(response.httpVersion).toBe('2.0');
});

test('Should auto downgrade protocol', async () => {
const response = await gotScraping({ url: 'https://eshop.coop-box.cz/' });
expect(response.statusCode).toBe(200);
expect(response.httpVersion).toBe('1.1');
expect(response.request.options.headers.Accept).toBeDefined(); // capitalized headers are proof
});

if (nodeVersion >= 12) {
test('Should allow https target via http proxy when auto downgrading', async () => {
const response = await gotScraping({
url: 'https://eshop.coop-box.cz/',
proxyUrl: `http://groups-SHADER,session-123:${process.env.APIFY_PROXY_PASSWORD}@proxy.apify.com:8000`,

});
expect(response.statusCode).toBe(200);
expect(response.httpVersion).toBe('1.1');
expect(response.request.options.headers.Accept).toBeDefined(); // capitalized headers are proof
});
}

test('should work with proxyUrl and http1', async () => {
const response = await gotScraping({
responseType: 'json',
Expand Down
6 changes: 5 additions & 1 deletion test/proxy.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ describe('Proxy', () => {
context: {},
https: {},
};
next = () => {};
next = () => { };
});

afterEach(() => {
jest.clearAllMocks();
});

test('should modify agents only if proxyUrl provided', async () => {
Expand Down
2 changes: 1 addition & 1 deletion test/scraping-defaults.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ describe('Scraping defaults', () => {
const { useHeaderGenerator, timeout, ...gotDefaults } = SCRAPING_DEFAULT_OPTIONS;

const response = await gotScraping.get(`http://localhost:${port}/html`);
expect(response.request.options).toMatchObject({ ...gotDefaults, timeout: { request: timeout } });
expect(response.request.options).toMatchObject({ ...gotDefaults, http2: false, timeout: { request: timeout } });
expect(response.request.options.context).toMatchObject({ useHeaderGenerator });
});

Expand Down

0 comments on commit 2badb20

Please sign in to comment.