diff --git a/lib/index.js b/lib/index.js index 895bf98..0374646 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,15 +1,12 @@ const AssetGraph = require('assetgraph'); const async = require('async'); -const request = require('request'); const version = require('../package.json').version; const relationDebugDescription = require('./relationDebugDescription'); const prettyBytes = require('pretty-bytes'); const net = require('net'); const tls = require('tls'); -const defaultSkipFilters = [ - require('./known-culprits/linkedin') -]; +const defaultSkipFilters = [require('./known-culprits/linkedin')]; const hyperlinkUserAgent = `Hyperlink v${version} (https://www.npmjs.com/package/hyperlink)`; @@ -161,184 +158,6 @@ async function hyperlink( }; } - function httpStatus(asset, attempt = 1) { - const url = asset.url; - const relations = asset._incoming; - - const loadReport = { - operator: 'external-check', - name: `external-check ${url}`, - at: [...new Set(relations.map(r => r.debugDescription))].join( - '\n ' - ), - expected: `200 ${url}` - }; - - return callback => { - if (shouldSkip(loadReport)) { - return setTimeout(callback); - } - - request( - { - method: attempt === 1 ? 'head' : 'get', - url: asset.url, - strictSSL: true, - gzip: true, - headers: { - 'User-Agent': hyperlinkUserAgent, - Accept: - 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, sdch, br' - } - }, - (error, res) => { - if (error) { - const code = error.code; - let actual = code || 'Unknown error'; - - switch (code) { - case 'ENOTFOUND': - actual = `DNS missing: ${asset.hostname}`; - break; - case 'HPE_INVALID_CONSTANT': - if (attempt === 1) { - return httpStatus(asset, attempt + 1)(callback); - } - break; - } - - reportTest({ - ...loadReport, - ok: false, - actual - }); - - return callback(); - } - - const status = res.statusCode; - - if (status >= 200 && status < 300) { - const contentType = res.headers['content-type']; - if (contentType && asset.type) { - const matchContentType = contentType.match( - /^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i - ); - if (matchContentType && asset.expectedTypes) { - asset.contentType = matchContentType[1].toLowerCase(); - asset._tryUpgrade(); - } - } else if (!contentType) { - const contentTypeMisingReport = { - ok: false, - name: `content-type-missing ${asset.urlOrDescription}`, - operator: 'content-type-missing', - expected: - asset.contentType || - `A Content-Type compatible with ${asset.type}`, - actual: contentType, - at: [...new Set(relations.map(r => r.debugDescription))].join( - '\n ' - ) - }; - - if (!shouldSkip(contentTypeMisingReport)) { - reportTest(contentTypeMisingReport); - } - } - } - - // Some servers respond weirdly to HEAD requests. Make a second attempt with GET - if (attempt === 1 && status >= 400 && status < 600) { - return httpStatus(asset, attempt + 1)(callback); - } - - // Some servers (jspm.io) respond with 502 if requesting HEAD, then GET to close in succession. Give the server a second to cool down - if (attempt === 2 && status === 502) { - setTimeout(() => httpStatus(asset, attempt + 1)(callback), 1000); - return; - } - - const redirects = res.request._redirect.redirects; - if (redirects.length > 0) { - const log = [{ redirectUri: url }, ...redirects].map( - (item, idx, arr) => { - if (arr[idx + 1]) { - item.statusCode = arr[idx + 1].statusCode; - } else { - item.statusCode = 200; - } - - return item; - } - ); - - const redirectReport = { - operator: 'external-redirect', - name: `external-redirect ${url}`, - at: [...new Set(relations.map(r => r.debugDescription))].join( - '\n ' - ), - expected: `302 ${url} --> 200 ${log[log.length - 1].redirectUri}` - }; - - const actual = log - .map(redirect => `${redirect.statusCode} ${redirect.redirectUri}`) - .join(' --> '); - - if (!shouldSkip(redirectReport)) { - // A single temporary redirect is allowed - if ([302, 307].includes(log[0].statusCode)) { - if (log.length < 3) { - reportTest({ - ...redirectReport, - expected: actual, - actual, - ok: true - }); - } else { - reportTest({ - ...redirectReport, - expected: `${log[0].statusCode} ${url} --> 200 ${ - log[log.length - 1].redirectUri - }`, - actual, - ok: false - }); - } - } else { - reportTest({ - ...redirectReport, - actual, - ok: false - }); - } - } - } - - if (status === 200) { - reportTest({ - ...loadReport, - ok: true, - actual: loadReport.expected - }); - - return callback(); - } - - reportTest({ - ...loadReport, - actual: `${status} ${url}`, - ok: false - }); - - return callback(); - } - ); - }; - } - if (verbose) { ag.on('addRelation', relation => { console.error('addRelation', relation.toString()); @@ -349,16 +168,22 @@ async function hyperlink( } function handleError(error) { - // Explicitly handle incompatible types warning + // Detect and upgrade certain errors from AssetGraph: + let operator; if (error.stack && error.stack.includes('_warnIncompatibleTypes')) { + operator = 'content-type-mismatch'; + } else if (error.message === 'No Content-Type response header received') { + operator = 'content-type-missing'; + } + if (operator) { const asset = error.asset; const expected = asset.contentType || `A Content-Type compatible with ${asset.type}`; const contentTypeMismatchReport = { ok: false, - operator: 'content-type-mismatch', - name: `content-type-mismatch ${asset.urlOrDescription}`, + operator, + name: `${operator} ${asset.urlOrDescription}`, expected, actual: error.message, at: [...new Set(asset._incoming.map(r => r.debugDescription))].join( @@ -372,7 +197,6 @@ async function hyperlink( return; } - const message = error.message || error; const asset = error.asset || (error.relation && error.relation.to); const report = { @@ -438,9 +262,10 @@ async function hyperlink( async function processAsset(asset) { if (!processedAssets.has(asset)) { processedAssets.add(asset); + const operator = asset._metadataOnly ? 'external-check' : 'load'; const loadReport = { - operator: 'load', - name: `load ${asset.urlOrDescription}`, + operator, + name: `${operator} ${asset.urlOrDescription}`, expected: `200 ${asset.urlOrDescription}` }; @@ -455,27 +280,63 @@ async function hyperlink( } try { - await asset.load(); + await asset.load({ metadataOnly: asset._metadataOnly }); reportTest({ ...loadReport, ok: true }); } catch (err) { - reportTest({ - ...loadReport, - ok: false, - actual: err.message - }); - return; + if ( + asset._metadataOnly && + err.statusCode && + err.statusCode >= 400 && + err.statusCode <= 600 + ) { + try { + await asset.load(); // Trigger a GET + } catch (err) { + reportTest({ + ...loadReport, + ok: false, + actual: err.message + }); + return; + } + } else { + reportTest({ + ...loadReport, + ok: false, + actual: err.message + }); + return; + } + } + + if (asset.type === 'Html' && !asset._metadataOnly) { + // Remember the set of ids in the document before unloading so incoming fragments can be checked: + asset.ids = new Set(); + for (const element of Array.from( + asset.parseTree.querySelectorAll('[id]') + )) { + asset.ids.add(element.getAttribute('id')); + } } - // In non-recursive mode local assets might be marked as end-of-line. - // This is specifically relevant to local file-URLs if (asset.stopProcessing) { + asset.unload(); return; } + // Save info for the redirect check later + if (asset.statusCode >= 300 && asset.statusCode < 400) { + const redirectRelation = asset.outgoingRelations.find( + r => r.type === 'HttpRedirect' + ); + asset._redirectRelation = redirectRelation; + redirectRelation.to._hasIncomingRedirect = true; + } + for (const relation of asset.externalRelations) { // Only do work for supported protocols if (!['http:', 'https:', 'file:'].includes(relation.to.protocol)) { @@ -546,23 +407,18 @@ async function hyperlink( }; if (!shouldSkip(mixedContentReport)) { - if (mixedContentReport.actual !== mixedContentReport.expected) { - reportTest({ - ...mixedContentReport, - ok: false - }); - } else { - reportTest({ - ...mixedContentReport, - ok: true - }); - } + reportTest({ + ...mixedContentReport, + ok: mixedContentReport.actual === mixedContentReport.expected + }); } } let follow; - - if ( + let metadataOnly = asset._metadataOnly; + if (['HttpRedirect', 'FileRedirect'].includes(relation.type)) { + follow = true; + } else if ( ['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type) ) { follow = false; @@ -572,18 +428,11 @@ async function hyperlink( ) { if (!relation.crossorigin && recursive) { follow = true; + } else if (relation.fragment && relation.fragment !== '#') { + follow = true; + relation.to.stopProcessing = true; } else if (relation.from !== relation.to) { - // If we are handling local file-urls, follow but mark as end-of-line in processing - if ( - !recursive && - relation.from.protocol === 'file:' && - relation.to.protocol === 'file:' - ) { - follow = true; - relation.to.stopProcessing = true; - } else { - relation.to.check = true; - } + metadataOnly = true; } } else if ( /^(?:JavaScript|Css)Source(?:Mapping)Url$/.test(relation.type) @@ -591,43 +440,31 @@ async function hyperlink( if (followSourceMaps) { follow = true; } else { - relation.to.check = true; + metadataOnly = true; } } else if ( ['SourceMapFile', 'SourceMapSource'].includes(relation.type) ) { if (followSourceMaps) { - relation.to.check = true; + metadataOnly = true; } } else { follow = true; } - if (follow) { + if (follow || metadataOnly) { if (assetTypesWithoutRelations.includes(relation.to.type)) { - // If we are handling local file-urls, follow but mark as end-of-line in processing - if ( - relation.from.protocol === 'file:' && - relation.to.protocol === 'file:' - ) { - relation.to.stopProcessing = !recursive; - assetQueue.push(relation.to); - } else { - relation.to.check = true; - } + metadataOnly = true; } else { assetQueue.push(relation.to); } - } - } - - if (asset.type === 'Html') { - // Remember the set of ids in the document before unloading so incoming fragments can be checked: - asset.ids = new Set(); - for (const element of Array.from( - asset.parseTree.querySelectorAll('[id]') - )) { - asset.ids.add(element.getAttribute('id')); + if (relation.to._metadataOnly && !metadataOnly) { + // Make sure that we GET an asset that was previously only HEADed + // now that a new relation came about + processedAssets.delete(relation.to); + } + relation.to._metadataOnly = metadataOnly; + assetQueue.push(relation.to); } } @@ -694,27 +531,99 @@ async function hyperlink( } } - // Check urls - const assetsToCheck = ag - .findAssets({ check: true }) - .filter(asset => !processedAssets.has(asset)); - t.push({ - name: `Crawling ${assetsToCheck.length} outgoing urls` - }); + // Check redirects + + function checkRedirectChainFrom(asset, isCycle) { + const redirectChain = [asset]; + let cursor = asset; + cursor._processedRedirect = true; + while ( + cursor._redirectRelation && + !redirectChain.includes(cursor._redirectRelation.to) + ) { + cursor = cursor._redirectRelation.to; + cursor._processedRedirect = true; + redirectChain.push(cursor); + } + let at; + if (asset._incoming && asset._incoming[0].debugDescription) { + at = asset._incoming[0].debugDescription; + } else { + at = `${asset.urlOrDescription} (input URL)`; + } - await new Promise((resolve, reject) => - async.parallelLimit( - assetsToCheck.map(asset => httpStatus(asset)), - 20, - err => { - if (err) { - reject(err); + const redirectReport = { + operator: 'external-redirect', + name: `external-redirect ${asset.url}`, + at, + expected: `302 ${asset.url} --> 200 ${ + redirectChain[redirectChain.length - 1].url + }` + }; + + if (!shouldSkip(redirectReport)) { + // A single temporary redirect is allowed + if (isCycle) { + redirectChain.push(cursor._redirectRelation.to); + } + const actual = redirectChain + .map(asset => `${asset.statusCode} ${asset.url}`) + .join(' --> '); + + if (isCycle) { + reportTest({ + ...redirectReport, + operator: 'redirect-cycle', + actual, + ok: false + }); + } else if ([302, 307].includes(redirectChain[0].statusCode)) { + if (redirectChain.length < 3) { + reportTest({ + ...redirectReport, + expected: actual, + actual, + ok: true + }); } else { - resolve(); + reportTest({ + ...redirectReport, + expected: `${redirectChain[0].statusCode} ${asset.url} --> 200 ${ + redirectChain[redirectChain.length - 1].url + }`, + actual, + ok: false + }); } + } else { + reportTest({ + ...redirectReport, + actual, + ok: false + }); } - ) - ); + } + } + + for (const asset of ag.findAssets({ + _redirectRelation: { $exists: true }, + _hasIncomingRedirect: { $ne: true } + })) { + checkRedirectChainFrom(asset); + } + + // The redirects without _processedRedirect:true at this + // point participate in at least one cycle: + for (const asset of ag + .findAssets({ + _redirectRelation: { $exists: true }, + _processedRedirect: { $ne: true } + }) + .sort((a, b) => parseInt(a.id) - parseInt(b.id))) { + if (!asset._processedRedirect) { + checkRedirectChainFrom(asset, true); + } + } // Check Content-Type vs. incoming relation targetTypes: diff --git a/lib/relationDebugDescription.js b/lib/relationDebugDescription.js index 1f1dc10..f46514c 100644 --- a/lib/relationDebugDescription.js +++ b/lib/relationDebugDescription.js @@ -7,7 +7,7 @@ module.exports = function relationDebugDescription(relation) { var asset = relation.from.nonInlineAncestor; - if (asset.isText) { + if (asset.isText && asset.isLoaded) { var text = asset.rawSrc.toString(); var linesBefore = text.split(relation.href)[0].split('\n'); var charsBefore = linesBefore[linesBefore.length - 1]; diff --git a/test/index.js b/test/index.js index 8a0bc18..b47918c 100644 --- a/test/index.js +++ b/test/index.js @@ -119,7 +119,6 @@ describe('hyperlink', function() { name: 'load https://example.com/', ok: true }); - t.push({ name: 'Crawling 2 outgoing urls' }); t.push(null, { ok: true, name: 'external-check https://google.com' @@ -234,7 +233,16 @@ describe('hyperlink', function() { t ); - expect(t.close(), 'to satisfy', { fail: 1 }); + expect(t.close(), 'to satisfy', { fail: 2 }); + expect(t.push, 'to have a call satisfying', () => { + t.push(null, { + ok: false, + operator: 'content-type-mismatch', + name: 'content-type-mismatch https://example.com/hey.png', + actual: 'Asset is used as both Png and Text', + at: 'https://example.com/ (6:25) ' + }); + }); expect(t.push, 'to have a call satisfying', () => { t.push(null, { ok: false, @@ -288,6 +296,7 @@ describe('hyperlink', function() { ok: false, operator: 'content-type-missing', name: 'content-type-missing https://example.com/hey.png', + actual: 'No Content-Type response header received', at: 'https://example.com/ (6:25) ' }); }); @@ -373,10 +382,6 @@ describe('hyperlink', function() { actual: expect.it('to begin with', 'ENOENT: no such file or directory') }); - t.push({ - name: 'Crawling 0 outgoing urls' - }); - t.push({ name: 'Connecting to 0 hosts (checking ' @@ -409,11 +414,6 @@ describe('hyperlink', function() { skip: 0, todo: 0 }); - expect(t.push, 'to have a call satisfying', () => { - t.push({ - name: 'Crawling 0 outgoing urls' - }); - }); expect(t.push, 'to have no calls satisfying', () => { t.push(null, { operator: 'fragment-check', @@ -448,11 +448,6 @@ describe('hyperlink', function() { skip: 0, todo: 0 }); - expect(t.push, 'to have a call satisfying', () => { - t.push({ - name: 'Crawling 0 outgoing urls' - }); - }); }); }); @@ -622,6 +617,55 @@ describe('hyperlink', function() { }); }); + it('should not issue an error when referencing an external asset with an existing fragment', async function() { + httpception([ + { + request: 'GET https://example.com/', + response: { + statusCode: 200, + headers: { + 'Content-Type': 'text/html; charset=UTF-8' + }, + body: + 'Link' + } + }, + { + request: 'GET https://test.external.tools/foo.html', + response: { + statusCode: 200, + headers: { + 'Content-Type': 'text/html; charset=UTF-8' + }, + body: + '
I exist
' + } + } + ]); + + const t = new TapRender(); + sinon.spy(t, 'push'); + await hyperlink( + { + recursive: true, + root: 'https://example.com/', + inputUrls: ['https://example.com/'] + }, + t + ); + + expect(t.close(), 'to satisfy', { fail: 0 }); + expect(t.push, 'to have a call satisfying', () => { + t.push(null, { + ok: true, + operator: 'fragment-check', + name: + 'fragment-check https://example.com/ --> https://test.external.tools/foo.html#frag', + expected: 'id="frag"' + }); + }); + }); + it('should be fine when an asset references itself with an empty fragment', async function() { httpception([ { @@ -769,7 +813,6 @@ describe('hyperlink', function() { request: 'HEAD https://mycdn.com/404.eot', response: 404 }, - // retry { request: 'GET https://mycdn.com/404.eot', response: 404 @@ -792,7 +835,7 @@ describe('hyperlink', function() { operator: 'external-check', name: 'external-check https://mycdn.com/404.eot', expected: '200 https://mycdn.com/404.eot', - actual: '404 https://mycdn.com/404.eot' + actual: 'HTTP 404 Not Found' }); }); }); @@ -930,10 +973,9 @@ describe('hyperlink', function() { }, t ); - expect(t.close(), 'to satisfy', { - count: 3, - pass: 3, + count: 4, + pass: 4, fail: 0, skip: 0, todo: 0 @@ -999,8 +1041,8 @@ describe('hyperlink', function() { ); expect(t.close(), 'to satisfy', { - count: 3, - pass: 2, + count: 5, + pass: 4, fail: 1, skip: 0, todo: 0 @@ -1008,6 +1050,8 @@ describe('hyperlink', function() { expect(t.push, 'to have a call satisfying', () => { t.push(null, { ok: false, + at: + 'https://example.com/ (1:35) ...', name: 'external-redirect https://elsewhere.com/', expected: '302 https://elsewhere.com/ --> 200 https://elsewhere.com/finalDestination', @@ -1070,7 +1114,7 @@ describe('hyperlink', function() { t ); - expect(t.close(), 'to satisfy', { fail: 1 }); + expect(t.close(), 'to satisfy', { fail: 2 }); expect(t.push, 'to have a call satisfying', () => { t.push(null, { ok: false, @@ -1082,6 +1126,60 @@ describe('hyperlink', function() { }); }); }); + + it('should report a redirect cycle', async function() { + httpception([ + { + request: 'GET https://example.com/', + response: { + statusCode: 200, + headers: { + 'Content-Type': 'text/html; charset=UTF-8' + }, + body: + '' + } + }, + { + request: 'GET https://elsewhere.com/', + response: { + statusCode: 302, + headers: { + Location: 'https://elsewhere.com/redirectTarget' + } + } + }, + { + request: 'GET https://elsewhere.com/redirectTarget', + response: { + statusCode: 302, + headers: { + Location: 'https://elsewhere.com/' + } + } + } + ]); + + const t = new TapRender(); + sinon.spy(t, 'push'); + await hyperlink( + { + root: 'https://example.com/', + inputUrls: ['https://example.com/'] + }, + t + ); + + expect(t.close(), 'to satisfy', { fail: 1 }); + expect(t.push, 'to have a call satisfying', () => { + t.push(null, { + ok: false, + operator: 'redirect-cycle', + actual: + '302 https://elsewhere.com/ --> 302 https://elsewhere.com/redirectTarget --> 302 https://elsewhere.com/' + }); + }); + }); }); describe('with a preconnect link', function() { @@ -1956,8 +2054,70 @@ describe('hyperlink', function() { ok: false, at: 'https://example.com/ (6:25) ', expected: '200 https://example.com/hey.png', - actual: '503 https://example.com/hey.png' + actual: 'HTTP 503 Service Unavailable' }); }); }); + + it('should GET an asset that was previously HEADed if new, "non-external" relations show up', async function() { + httpception([ + { + request: 'GET https://example.com/', + response: { + statusCode: 200, + headers: { + 'Content-Type': 'text/html; charset=UTF-8' + }, + body: ` + + + + + + + + + ` + } + }, + { + request: 'HEAD https://example.com/otherpage.html', + response: { + headers: { + 'Content-Type': 'text/html' + } + } + }, + { + request: 'GET https://example.com/script.js', + response: { + headers: { + 'Content-Type': 'application/javascript' + }, + body: 'alert("Hello " + "/otherpage.html".toString("url"));' + } + }, + { + request: 'GET https://example.com/otherpage.html', + response: { + headers: { + 'Content-Type': 'text/html' + }, + body: '

Hello, world!

' + } + } + ]); + + const t = new TapRender(); + sinon.spy(t, 'push'); + await hyperlink( + { + root: 'https://example.com/', + inputUrls: ['https://example.com/'] + }, + t + ); + + expect(t.close(), 'to satisfy', { fail: 0 }); + }); });