diff --git a/lib/index.js b/lib/index.js
index 895bf98..0374646 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -1,15 +1,12 @@
const AssetGraph = require('assetgraph');
const async = require('async');
-const request = require('request');
const version = require('../package.json').version;
const relationDebugDescription = require('./relationDebugDescription');
const prettyBytes = require('pretty-bytes');
const net = require('net');
const tls = require('tls');
-const defaultSkipFilters = [
- require('./known-culprits/linkedin')
-];
+const defaultSkipFilters = [require('./known-culprits/linkedin')];
const hyperlinkUserAgent = `Hyperlink v${version} (https://www.npmjs.com/package/hyperlink)`;
@@ -161,184 +158,6 @@ async function hyperlink(
};
}
- function httpStatus(asset, attempt = 1) {
- const url = asset.url;
- const relations = asset._incoming;
-
- const loadReport = {
- operator: 'external-check',
- name: `external-check ${url}`,
- at: [...new Set(relations.map(r => r.debugDescription))].join(
- '\n '
- ),
- expected: `200 ${url}`
- };
-
- return callback => {
- if (shouldSkip(loadReport)) {
- return setTimeout(callback);
- }
-
- request(
- {
- method: attempt === 1 ? 'head' : 'get',
- url: asset.url,
- strictSSL: true,
- gzip: true,
- headers: {
- 'User-Agent': hyperlinkUserAgent,
- Accept:
- 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
- 'Accept-Encoding': 'gzip, deflate, sdch, br'
- }
- },
- (error, res) => {
- if (error) {
- const code = error.code;
- let actual = code || 'Unknown error';
-
- switch (code) {
- case 'ENOTFOUND':
- actual = `DNS missing: ${asset.hostname}`;
- break;
- case 'HPE_INVALID_CONSTANT':
- if (attempt === 1) {
- return httpStatus(asset, attempt + 1)(callback);
- }
- break;
- }
-
- reportTest({
- ...loadReport,
- ok: false,
- actual
- });
-
- return callback();
- }
-
- const status = res.statusCode;
-
- if (status >= 200 && status < 300) {
- const contentType = res.headers['content-type'];
- if (contentType && asset.type) {
- const matchContentType = contentType.match(
- /^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i
- );
- if (matchContentType && asset.expectedTypes) {
- asset.contentType = matchContentType[1].toLowerCase();
- asset._tryUpgrade();
- }
- } else if (!contentType) {
- const contentTypeMisingReport = {
- ok: false,
- name: `content-type-missing ${asset.urlOrDescription}`,
- operator: 'content-type-missing',
- expected:
- asset.contentType ||
- `A Content-Type compatible with ${asset.type}`,
- actual: contentType,
- at: [...new Set(relations.map(r => r.debugDescription))].join(
- '\n '
- )
- };
-
- if (!shouldSkip(contentTypeMisingReport)) {
- reportTest(contentTypeMisingReport);
- }
- }
- }
-
- // Some servers respond weirdly to HEAD requests. Make a second attempt with GET
- if (attempt === 1 && status >= 400 && status < 600) {
- return httpStatus(asset, attempt + 1)(callback);
- }
-
- // Some servers (jspm.io) respond with 502 if requesting HEAD, then GET to close in succession. Give the server a second to cool down
- if (attempt === 2 && status === 502) {
- setTimeout(() => httpStatus(asset, attempt + 1)(callback), 1000);
- return;
- }
-
- const redirects = res.request._redirect.redirects;
- if (redirects.length > 0) {
- const log = [{ redirectUri: url }, ...redirects].map(
- (item, idx, arr) => {
- if (arr[idx + 1]) {
- item.statusCode = arr[idx + 1].statusCode;
- } else {
- item.statusCode = 200;
- }
-
- return item;
- }
- );
-
- const redirectReport = {
- operator: 'external-redirect',
- name: `external-redirect ${url}`,
- at: [...new Set(relations.map(r => r.debugDescription))].join(
- '\n '
- ),
- expected: `302 ${url} --> 200 ${log[log.length - 1].redirectUri}`
- };
-
- const actual = log
- .map(redirect => `${redirect.statusCode} ${redirect.redirectUri}`)
- .join(' --> ');
-
- if (!shouldSkip(redirectReport)) {
- // A single temporary redirect is allowed
- if ([302, 307].includes(log[0].statusCode)) {
- if (log.length < 3) {
- reportTest({
- ...redirectReport,
- expected: actual,
- actual,
- ok: true
- });
- } else {
- reportTest({
- ...redirectReport,
- expected: `${log[0].statusCode} ${url} --> 200 ${
- log[log.length - 1].redirectUri
- }`,
- actual,
- ok: false
- });
- }
- } else {
- reportTest({
- ...redirectReport,
- actual,
- ok: false
- });
- }
- }
- }
-
- if (status === 200) {
- reportTest({
- ...loadReport,
- ok: true,
- actual: loadReport.expected
- });
-
- return callback();
- }
-
- reportTest({
- ...loadReport,
- actual: `${status} ${url}`,
- ok: false
- });
-
- return callback();
- }
- );
- };
- }
-
if (verbose) {
ag.on('addRelation', relation => {
console.error('addRelation', relation.toString());
@@ -349,16 +168,22 @@ async function hyperlink(
}
function handleError(error) {
- // Explicitly handle incompatible types warning
+ // Detect and upgrade certain errors from AssetGraph:
+ let operator;
if (error.stack && error.stack.includes('_warnIncompatibleTypes')) {
+ operator = 'content-type-mismatch';
+ } else if (error.message === 'No Content-Type response header received') {
+ operator = 'content-type-missing';
+ }
+ if (operator) {
const asset = error.asset;
const expected =
asset.contentType || `A Content-Type compatible with ${asset.type}`;
const contentTypeMismatchReport = {
ok: false,
- operator: 'content-type-mismatch',
- name: `content-type-mismatch ${asset.urlOrDescription}`,
+ operator,
+ name: `${operator} ${asset.urlOrDescription}`,
expected,
actual: error.message,
at: [...new Set(asset._incoming.map(r => r.debugDescription))].join(
@@ -372,7 +197,6 @@ async function hyperlink(
return;
}
-
const message = error.message || error;
const asset = error.asset || (error.relation && error.relation.to);
const report = {
@@ -438,9 +262,10 @@ async function hyperlink(
async function processAsset(asset) {
if (!processedAssets.has(asset)) {
processedAssets.add(asset);
+ const operator = asset._metadataOnly ? 'external-check' : 'load';
const loadReport = {
- operator: 'load',
- name: `load ${asset.urlOrDescription}`,
+ operator,
+ name: `${operator} ${asset.urlOrDescription}`,
expected: `200 ${asset.urlOrDescription}`
};
@@ -455,27 +280,63 @@ async function hyperlink(
}
try {
- await asset.load();
+ await asset.load({ metadataOnly: asset._metadataOnly });
reportTest({
...loadReport,
ok: true
});
} catch (err) {
- reportTest({
- ...loadReport,
- ok: false,
- actual: err.message
- });
- return;
+ // A failed metadata-only (HEAD) request is not conclusive: some servers
+ // respond weirdly to HEAD, so retry once with a full GET before failing.
+ let loadError = err;
+ if (
+ asset._metadataOnly &&
+ err.statusCode >= 400 &&
+ err.statusCode < 600
+ ) {
+ try {
+ await asset.load(); // Trigger a GET
+ loadError = undefined;
+ } catch (getError) {
+ // Report the outcome of the GET, as it is the more trustworthy attempt
+ loadError = getError;
+ }
+ }
+ if (loadError) {
+ reportTest({
+ ...loadReport,
+ ok: false,
+ actual: loadError.message
+ });
+ return;
+ }
+ // The GET fallback succeeded, so the asset still gets a passing result
+ reportTest({
+ ...loadReport,
+ ok: true
+ });
+ }
+
+ if (asset.type === 'Html' && !asset._metadataOnly) {
+ // Snapshot every id attribute of the document while its parse tree is
+ // still loaded, so that incoming fragment links can be verified later:
+ asset.ids = new Set();
+ Array.from(asset.parseTree.querySelectorAll('[id]')).forEach(element => {
+ asset.ids.add(element.getAttribute('id'));
+ });
}
- // In non-recursive mode local assets might be marked as end-of-line.
- // This is specifically relevant to local file-URLs
if (asset.stopProcessing) {
+ asset.unload();
return;
}
+ // Save info for the redirect check later
+ if (asset.statusCode >= 300 && asset.statusCode < 400) {
+ const redirectRelation = asset.outgoingRelations.find(
+ r => r.type === 'HttpRedirect'
+ );
+ // find() yields undefined when the asset has no HttpRedirect relation
+ // (presumably possible for e.g. a 304 or a 3xx without a usable
+ // Location header -- TODO confirm); there is nothing to record then.
+ if (redirectRelation) {
+ asset._redirectRelation = redirectRelation;
+ redirectRelation.to._hasIncomingRedirect = true;
+ }
+ }
+
for (const relation of asset.externalRelations) {
// Only do work for supported protocols
if (!['http:', 'https:', 'file:'].includes(relation.to.protocol)) {
@@ -546,23 +407,18 @@ async function hyperlink(
};
if (!shouldSkip(mixedContentReport)) {
- if (mixedContentReport.actual !== mixedContentReport.expected) {
- reportTest({
- ...mixedContentReport,
- ok: false
- });
- } else {
- reportTest({
- ...mixedContentReport,
- ok: true
- });
- }
+ reportTest({
+ ...mixedContentReport,
+ ok: mixedContentReport.actual === mixedContentReport.expected
+ });
}
}
let follow;
-
- if (
+ let metadataOnly = asset._metadataOnly;
+ if (['HttpRedirect', 'FileRedirect'].includes(relation.type)) {
+ follow = true;
+ } else if (
['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type)
) {
follow = false;
@@ -572,18 +428,11 @@ async function hyperlink(
) {
if (!relation.crossorigin && recursive) {
follow = true;
+ } else if (relation.fragment && relation.fragment !== '#') {
+ follow = true;
+ relation.to.stopProcessing = true;
} else if (relation.from !== relation.to) {
- // If we are handling local file-urls, follow but mark as end-of-line in processing
- if (
- !recursive &&
- relation.from.protocol === 'file:' &&
- relation.to.protocol === 'file:'
- ) {
- follow = true;
- relation.to.stopProcessing = true;
- } else {
- relation.to.check = true;
- }
+ metadataOnly = true;
}
} else if (
/^(?:JavaScript|Css)Source(?:Mapping)Url$/.test(relation.type)
@@ -591,43 +440,31 @@ async function hyperlink(
if (followSourceMaps) {
follow = true;
} else {
- relation.to.check = true;
+ metadataOnly = true;
}
} else if (
['SourceMapFile', 'SourceMapSource'].includes(relation.type)
) {
if (followSourceMaps) {
- relation.to.check = true;
+ metadataOnly = true;
}
} else {
follow = true;
}
- if (follow) {
+ // Queue the target when it must be crawled (follow) or merely checked
+ // for existence (metadataOnly)
+ if (follow || metadataOnly) {
if (assetTypesWithoutRelations.includes(relation.to.type)) {
- // If we are handling local file-urls, follow but mark as end-of-line in processing
- if (
- relation.from.protocol === 'file:' &&
- relation.to.protocol === 'file:'
- ) {
- relation.to.stopProcessing = !recursive;
- assetQueue.push(relation.to);
- } else {
- relation.to.check = true;
- }
- } else {
- assetQueue.push(relation.to);
+ // Presumably these asset types never have outgoing relations, so
+ // fetching their metadata is enough
+ metadataOnly = true;
}
- }
- }
-
- if (asset.type === 'Html') {
- // Remember the set of ids in the document before unloading so incoming fragments can be checked:
- asset.ids = new Set();
- for (const element of Array.from(
- asset.parseTree.querySelectorAll('[id]')
- )) {
- asset.ids.add(element.getAttribute('id'));
+ const target = relation.to;
+ if (!metadataOnly) {
+ if (target._metadataOnly) {
+ // Make sure that we GET an asset that was previously only HEADed
+ // now that a new relation came about
+ processedAssets.delete(target);
+ }
+ target._metadataOnly = false;
+ } else if (target._metadataOnly !== false) {
+ // Never downgrade: a target that an earlier relation needs fully
+ // loaded must not be reduced to a metadata-only check
+ target._metadataOnly = true;
+ }
+ // Queue exactly once per relation; processAsset ignores assets that
+ // are already in processedAssets
+ assetQueue.push(target);
}
}
@@ -694,27 +531,99 @@ async function hyperlink(
}
}
- // Check urls
- const assetsToCheck = ag
- .findAssets({ check: true })
- .filter(asset => !processedAssets.has(asset));
- t.push({
- name: `Crawling ${assetsToCheck.length} outgoing urls`
- });
+ // Check redirects
+
+ /**
+ * Follow the saved _redirectRelation links starting at `asset` and report
+ * the resulting redirect chain through reportTest (unless shouldSkip
+ * rejects the report). Every asset visited is flagged with
+ * _processedRedirect = true.
+ *
+ * A chain of at most one hop that starts with a 302 or 307 passes, a chain
+ * that loops back on itself is reported as 'redirect-cycle', and anything
+ * else fails as 'external-redirect'.
+ *
+ * @param {object} asset - The asset the chain starts from
+ * @param {boolean} [isCycle] - Caller's hint that the asset is part of a
+ * cycle. The walk detects loops by itself, so the hint is only kept for
+ * backward compatibility.
+ */
+ function checkRedirectChainFrom(asset, isCycle) {
+ const redirectChain = [asset];
+ let cursor = asset;
+ cursor._processedRedirect = true;
+ while (
+ cursor._redirectRelation &&
+ !redirectChain.includes(cursor._redirectRelation.to)
+ ) {
+ cursor = cursor._redirectRelation.to;
+ cursor._processedRedirect = true;
+ redirectChain.push(cursor);
+ }
+ // The walk only stops early when the next hop is already part of the
+ // chain, so a remaining _redirectRelation means the chain loops back
+ const loopTarget = cursor._redirectRelation && cursor._redirectRelation.to;
+ const endsInCycle = Boolean(loopTarget);
+ let at;
+ // _incoming can be missing or empty, so guard the first entry as well
+ const firstIncoming = asset._incoming && asset._incoming[0];
+ if (firstIncoming && firstIncoming.debugDescription) {
+ at = firstIncoming.debugDescription;
+ } else {
+ at = `${asset.urlOrDescription} (input URL)`;
+ }
- await new Promise((resolve, reject) =>
- async.parallelLimit(
- assetsToCheck.map(asset => httpStatus(asset)),
- 20,
- err => {
- if (err) {
- reject(err);
+ const redirectReport = {
+ operator: 'external-redirect',
+ name: `external-redirect ${asset.url}`,
+ at,
+ expected: `302 ${asset.url} --> 200 ${
+ redirectChain[redirectChain.length - 1].url
+ }`
+ };
+
+ if (!shouldSkip(redirectReport)) {
+ if (endsInCycle) {
+ // Repeat the revisited asset so the loop is visible in the output
+ redirectChain.push(loopTarget);
+ }
+ const actual = redirectChain
+ .map(hop => `${hop.statusCode} ${hop.url}`)
+ .join(' --> ');
+
+ if (endsInCycle) {
+ reportTest({
+ ...redirectReport,
+ operator: 'redirect-cycle',
+ actual,
+ ok: false
+ });
+ } else if ([302, 307].includes(redirectChain[0].statusCode)) {
+ // A single temporary redirect is allowed
+ if (redirectChain.length < 3) {
+ reportTest({
+ ...redirectReport,
+ expected: actual,
+ actual,
+ ok: true
+ });
} else {
- resolve();
+ reportTest({
+ ...redirectReport,
+ expected: `${redirectChain[0].statusCode} ${asset.url} --> 200 ${
+ redirectChain[redirectChain.length - 1].url
+ }`,
+ actual,
+ ok: false
+ });
}
+ } else {
+ reportTest({
+ ...redirectReport,
+ actual,
+ ok: false
+ });
}
- )
- );
+ }
+ }
+
+ // Start with the redirecting assets that no other redirect points at,
+ // i.e. the heads of the redirect chains:
+ for (const asset of ag.findAssets({
+ _redirectRelation: { $exists: true },
+ _hasIncomingRedirect: { $ne: true }
+ })) {
+ checkRedirectChainFrom(asset);
+ }
+
+ // The redirects without _processedRedirect:true at this
+ // point participate in at least one cycle.
+ // Sorting by numeric id presumably keeps the reporting order stable.
+ for (const asset of ag
+ .findAssets({
+ _redirectRelation: { $exists: true },
+ _processedRedirect: { $ne: true }
+ })
+ .sort((a, b) => parseInt(a.id, 10) - parseInt(b.id, 10))) {
+ // The query result was computed up front, and checkRedirectChainFrom
+ // flags every asset it walks over, so re-check before reporting
+ if (!asset._processedRedirect) {
+ checkRedirectChainFrom(asset, true);
+ }
+ }
// Check Content-Type vs. incoming relation targetTypes:
diff --git a/lib/relationDebugDescription.js b/lib/relationDebugDescription.js
index 1f1dc10..f46514c 100644
--- a/lib/relationDebugDescription.js
+++ b/lib/relationDebugDescription.js
@@ -7,7 +7,7 @@ module.exports = function relationDebugDescription(relation) {
var asset = relation.from.nonInlineAncestor;
- if (asset.isText) {
+ if (asset.isText && asset.isLoaded) {
var text = asset.rawSrc.toString();
var linesBefore = text.split(relation.href)[0].split('\n');
var charsBefore = linesBefore[linesBefore.length - 1];
diff --git a/test/index.js b/test/index.js
index 8a0bc18..b47918c 100644
--- a/test/index.js
+++ b/test/index.js
@@ -119,7 +119,6 @@ describe('hyperlink', function() {
name: 'load https://example.com/',
ok: true
});
- t.push({ name: 'Crawling 2 outgoing urls' });
t.push(null, {
ok: true,
name: 'external-check https://google.com'
@@ -234,7 +233,16 @@ describe('hyperlink', function() {
t
);
- expect(t.close(), 'to satisfy', { fail: 1 });
+ expect(t.close(), 'to satisfy', { fail: 2 });
+ expect(t.push, 'to have a call satisfying', () => {
+ t.push(null, {
+ ok: false,
+ operator: 'content-type-mismatch',
+ name: 'content-type-mismatch https://example.com/hey.png',
+ actual: 'Asset is used as both Png and Text',
+ at: 'https://example.com/ (6:25)
'
+ });
+ });
expect(t.push, 'to have a call satisfying', () => {
t.push(null, {
ok: false,
@@ -288,6 +296,7 @@ describe('hyperlink', function() {
ok: false,
operator: 'content-type-missing',
name: 'content-type-missing https://example.com/hey.png',
+ actual: 'No Content-Type response header received',
at: 'https://example.com/ (6:25)
'
});
});
@@ -373,10 +382,6 @@ describe('hyperlink', function() {
actual: expect.it('to begin with', 'ENOENT: no such file or directory')
});
- t.push({
- name: 'Crawling 0 outgoing urls'
- });
-
t.push({
name:
'Connecting to 0 hosts (checking '
@@ -409,11 +414,6 @@ describe('hyperlink', function() {
skip: 0,
todo: 0
});
- expect(t.push, 'to have a call satisfying', () => {
- t.push({
- name: 'Crawling 0 outgoing urls'
- });
- });
expect(t.push, 'to have no calls satisfying', () => {
t.push(null, {
operator: 'fragment-check',
@@ -448,11 +448,6 @@ describe('hyperlink', function() {
skip: 0,
todo: 0
});
- expect(t.push, 'to have a call satisfying', () => {
- t.push({
- name: 'Crawling 0 outgoing urls'
- });
- });
});
});
@@ -622,6 +617,55 @@ describe('hyperlink', function() {
});
});
+ it('should not issue an error when referencing an external asset with an existing fragment', async function() {
+ httpception([
+ {
+ request: 'GET https://example.com/',
+ response: {
+ statusCode: 200,
+ headers: {
+ 'Content-Type': 'text/html; charset=UTF-8'
+ },
+ body:
+ '

',
expected: '200 https://example.com/hey.png',
- actual: '503 https://example.com/hey.png'
+ actual: 'HTTP 503 Service Unavailable'
});
});
});
+
+ it('should GET an asset that was previously HEADed if new, "non-external" relations show up', async function() {
+ httpception([
+ {
+ request: 'GET https://example.com/',
+ response: {
+ statusCode: 200,
+ headers: {
+ 'Content-Type': 'text/html; charset=UTF-8'
+ },
+ body: `
+
+
+
+
+
+
+
+
+ `
+ }
+ },
+ {
+ request: 'HEAD https://example.com/otherpage.html',
+ response: {
+ headers: {
+ 'Content-Type': 'text/html'
+ }
+ }
+ },
+ {
+ request: 'GET https://example.com/script.js',
+ response: {
+ headers: {
+ 'Content-Type': 'application/javascript'
+ },
+ body: 'alert("Hello " + "/otherpage.html".toString("url"));'
+ }
+ },
+ {
+ request: 'GET https://example.com/otherpage.html',
+ response: {
+ headers: {
+ 'Content-Type': 'text/html'
+ },
+ body: '