211 lines
4.5 KiB
JavaScript
211 lines
4.5 KiB
JavaScript
'use strict'
|
||
|
||
var ccount = require('ccount')
|
||
var decode = require('parse-entities')
|
||
var decimal = require('is-decimal')
|
||
var alphabetical = require('is-alphabetical')
|
||
var whitespace = require('is-whitespace-character')
|
||
var locate = require('../locate/url')
|
||
|
||
// Export the tokenizer; the `url` function declaration is hoisted, so it is
// already defined here.
module.exports = url
// Attach the locator loaded from `../locate/url`.
url.locator = locate
// Flag read by the consumer of this tokenizer; presumably it keeps literal URLs
// from being tokenized while already inside a link (confirm against the
// inline tokenizer runner).
url.notInLink = true
|
||
|
||
// Character codes compared against the results of `value.charCodeAt()`,
// listed in ascending order.
var exclamationMark = 33 // '!'
var ampersand = 38 // '&'
var rightParenthesis = 41 // ')'
var asterisk = 42 // '*'
var comma = 44 // ','
var dash = 45 // '-'
var dot = 46 // '.'
var colon = 58 // ':'
var semicolon = 59 // ';'
var lessThan = 60 // '<'
var questionMark = 63 // '?'
var underscore = 95 // '_'
var tilde = 126 // '~'

// Single-character strings handed to `ccount` and `lastIndexOf` when balancing
// parentheses in a path.
var leftParenthesisCharacter = '('
var rightParenthesisCharacter = ')'
|
||
|
||
// Tokenize a literal URL (`www.…`, `http://…`, or `https://…`) found at the
// start of `value`.
//
// Invoked with the parser as `this`: reads `this.options.gfm` and
// `this.inlineTokenizers`, and calls `this.enterLink()` and
// `this.tokenizeInline()`.
//
// Parameters:
//   eat    - called as `eat(content)(node)` to consume the matched text;
//            `eat.now()` supplies the position passed to `tokenizeInline`.
//   value  - remaining input, expected to start at the candidate URL.
//   silent - when truthy, return `true` as soon as a valid domain is found,
//            without consuming anything.
//
// Returns `undefined` when `gfm` is off, the prefix does not match, or the
// domain is invalid; `true` in silent mode; otherwise whatever
// `eat(content)(node)` returns for the created `link` node.
function url(eat, value, silent) {
  var self = this
  var gfm = self.options.gfm
  var tokenizers = self.inlineTokenizers
  var length = value.length
  var protocolless = false
  var dots = []
  var previousDot
  var lastTwoPartsStart
  var start
  var index
  var pathStart
  var path
  var code
  var end
  var leftCount
  var rightCount
  var content
  var children
  var href
  var exit

  if (!gfm) {
    return
  }

  // `WWW.` doesn’t work: the `www.` prefix is matched case-sensitively, the
  // protocols are not.
  if (value.slice(0, 4) === 'www.') {
    protocolless = true
    index = 4
  } else if (value.slice(0, 7).toLowerCase() === 'http://') {
    index = 7
  } else if (value.slice(0, 8).toLowerCase() === 'https://') {
    index = 8
  } else {
    return
  }

  // Act as if the starting boundary is a dot, so that a dot directly after the
  // prefix is rejected like any other doubled dot.
  previousDot = index - 1

  // Parse a valid domain.
  start = index

  while (index < length) {
    code = value.charCodeAt(index)

    if (code === dot) {
      // Dots may not appear after each other.
      if (previousDot === index - 1) {
        break
      }

      dots.push(index)
      previousDot = index
      index++
      continue
    }

    if (
      decimal(code) ||
      alphabetical(code) ||
      code === dash ||
      code === underscore
    ) {
      index++
      continue
    }

    break
  }

  // Ignore a final dot: `code` is the last character inspected, so this covers
  // both a trailing dot at the end of input and a doubled dot.
  if (code === dot) {
    dots.pop()
    index--
  }

  // If there are no dots, exit.
  if (dots[0] === undefined) {
    return
  }

  // If there is an underscore in the last two domain parts, exit:
  // `www.example.c_m` and `www.ex_ample.com` are not OK, but
  // `www.sub_domain.example.com` is.
  lastTwoPartsStart = dots.length < 2 ? start : dots[dots.length - 2] + 1

  if (value.slice(lastTwoPartsStart, index).indexOf('_') !== -1) {
    return
  }

  /* istanbul ignore if - never used (yet) */
  if (silent) {
    return true
  }

  end = index
  pathStart = index

  // Parse a path: run until whitespace or `<`, while `end` only advances past
  // characters that are not trailing punctuation, so that punctuation at the
  // very end is left out of the URL.
  while (index < length) {
    code = value.charCodeAt(index)

    if (whitespace(code) || code === lessThan) {
      break
    }

    index++

    if (
      code !== exclamationMark &&
      code !== asterisk &&
      code !== comma &&
      code !== dot &&
      code !== colon &&
      code !== questionMark &&
      code !== underscore &&
      code !== tilde
    ) {
      end = index
    }
  }

  index = end

  // If the path ends in a closing paren, and the count of closing parens is
  // higher than the opening count, then remove the superfluous closing parens.
  if (value.charCodeAt(index - 1) === rightParenthesis) {
    path = value.slice(pathStart, index)
    leftCount = ccount(path, leftParenthesisCharacter)
    rightCount = ccount(path, rightParenthesisCharacter)

    while (rightCount > leftCount) {
      index = pathStart + path.lastIndexOf(rightParenthesisCharacter)
      path = value.slice(pathStart, index)
      rightCount--
    }
  }

  if (value.charCodeAt(index - 1) === semicolon) {
    // GitHub doesn’t document this, but final semicolons aren’t part of the
    // URL either.
    index--

    // If the path ends in what looks like an entity (`&` followed by letters
    // and the semicolon just dropped), it’s not part of the path.
    if (alphabetical(value.charCodeAt(index - 1))) {
      end = index - 2

      while (alphabetical(value.charCodeAt(end))) {
        end--
      }

      if (value.charCodeAt(end) === ampersand) {
        index = end
      }
    }
  }

  content = value.slice(0, index)
  href = decode(content, {nonTerminated: false})

  if (protocolless) {
    href = 'http://' + href
  }

  exit = self.enterLink()

  // Temporarily remove all tokenizers except text in url.
  self.inlineTokenizers = {text: tokenizers.text}

  try {
    children = self.tokenizeInline(content, eat.now())
  } finally {
    // Always restore the tokenizers and leave the link state, even when
    // tokenizing the content throws.
    self.inlineTokenizers = tokenizers
    exit()
  }

  return eat(content)({
    type: 'link',
    title: null,
    url: href,
    children: children
  })
}
|