balmet.com

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

IDNAEncoder.php (11547B)


      1 <?php
      2 
      3 /**
      4  * IDNA URL encoder
      5  *
      6  * Note: Not fully compliant, as nameprep does nothing yet.
      7  *
      8  * @package Requests
      9  * @subpackage Utilities
     10  * @see https://tools.ietf.org/html/rfc3490 IDNA specification
     11  * @see https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
     12  */
     13 class Requests_IDNAEncoder {
     14 	/**
     15 	 * ACE prefix used for IDNA
     16 	 *
     17 	 * @see https://tools.ietf.org/html/rfc3490#section-5
     18 	 * @var string
     19 	 */
     20 	const ACE_PREFIX = 'xn--';
     21 
     22 	/**#@+
     23 	 * Bootstrap constant for Punycode
     24 	 *
     25 	 * @see https://tools.ietf.org/html/rfc3492#section-5
     26 	 * @var int
     27 	 */
     28 	const BOOTSTRAP_BASE         = 36;
     29 	const BOOTSTRAP_TMIN         = 1;
     30 	const BOOTSTRAP_TMAX         = 26;
     31 	const BOOTSTRAP_SKEW         = 38;
     32 	const BOOTSTRAP_DAMP         = 700;
     33 	const BOOTSTRAP_INITIAL_BIAS = 72;
     34 	const BOOTSTRAP_INITIAL_N    = 128;
     35 	/**#@-*/
     36 
     37 	/**
     38 	 * Encode a hostname using Punycode
     39 	 *
     40 	 * @param string $string Hostname
     41 	 * @return string Punycode-encoded hostname
     42 	 */
     43 	public static function encode($string) {
     44 		$parts = explode('.', $string);
     45 		foreach ($parts as &$part) {
     46 			$part = self::to_ascii($part);
     47 		}
     48 		return implode('.', $parts);
     49 	}
     50 
     51 	/**
     52 	 * Convert a UTF-8 string to an ASCII string using Punycode
     53 	 *
     54 	 * @throws Requests_Exception Provided string longer than 64 ASCII characters (`idna.provided_too_long`)
     55 	 * @throws Requests_Exception Prepared string longer than 64 ASCII characters (`idna.prepared_too_long`)
     56 	 * @throws Requests_Exception Provided string already begins with xn-- (`idna.provided_is_prefixed`)
     57 	 * @throws Requests_Exception Encoded string longer than 64 ASCII characters (`idna.encoded_too_long`)
     58 	 *
     59 	 * @param string $string ASCII or UTF-8 string (max length 64 characters)
     60 	 * @return string ASCII string
     61 	 */
     62 	public static function to_ascii($string) {
     63 		// Step 1: Check if the string is already ASCII
     64 		if (self::is_ascii($string)) {
     65 			// Skip to step 7
     66 			if (strlen($string) < 64) {
     67 				return $string;
     68 			}
     69 
     70 			throw new Requests_Exception('Provided string is too long', 'idna.provided_too_long', $string);
     71 		}
     72 
     73 		// Step 2: nameprep
     74 		$string = self::nameprep($string);
     75 
     76 		// Step 3: UseSTD3ASCIIRules is false, continue
     77 		// Step 4: Check if it's ASCII now
     78 		if (self::is_ascii($string)) {
     79 			// Skip to step 7
     80 			if (strlen($string) < 64) {
     81 				return $string;
     82 			}
     83 
     84 			throw new Requests_Exception('Prepared string is too long', 'idna.prepared_too_long', $string);
     85 		}
     86 
     87 		// Step 5: Check ACE prefix
     88 		if (strpos($string, self::ACE_PREFIX) === 0) {
     89 			throw new Requests_Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $string);
     90 		}
     91 
     92 		// Step 6: Encode with Punycode
     93 		$string = self::punycode_encode($string);
     94 
     95 		// Step 7: Prepend ACE prefix
     96 		$string = self::ACE_PREFIX . $string;
     97 
     98 		// Step 8: Check size
     99 		if (strlen($string) < 64) {
    100 			return $string;
    101 		}
    102 
    103 		throw new Requests_Exception('Encoded string is too long', 'idna.encoded_too_long', $string);
    104 	}
    105 
    106 	/**
    107 	 * Check whether a given string contains only ASCII characters
    108 	 *
    109 	 * @internal (Testing found regex was the fastest implementation)
    110 	 *
    111 	 * @param string $string
    112 	 * @return bool Is the string ASCII-only?
    113 	 */
    114 	protected static function is_ascii($string) {
    115 		return (preg_match('/(?:[^\x00-\x7F])/', $string) !== 1);
    116 	}
    117 
    118 	/**
    119 	 * Prepare a string for use as an IDNA name
    120 	 *
    121 	 * @todo Implement this based on RFC 3491 and the newer 5891
    122 	 * @param string $string
    123 	 * @return string Prepared string
    124 	 */
    125 	protected static function nameprep($string) {
    126 		return $string;
    127 	}
    128 
    129 	/**
    130 	 * Convert a UTF-8 string to a UCS-4 codepoint array
    131 	 *
    132 	 * Based on Requests_IRI::replace_invalid_with_pct_encoding()
    133 	 *
    134 	 * @throws Requests_Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
    135 	 * @param string $input
    136 	 * @return array Unicode code points
    137 	 */
    138 	protected static function utf8_to_codepoints($input) {
    139 		$codepoints = array();
    140 
    141 		// Get number of bytes
    142 		$strlen = strlen($input);
    143 
    144 		// phpcs:ignore Generic.CodeAnalysis.JumbledIncrementer -- This is a deliberate choice.
    145 		for ($position = 0; $position < $strlen; $position++) {
    146 			$value = ord($input[$position]);
    147 
    148 			// One byte sequence:
    149 			if ((~$value & 0x80) === 0x80) {
    150 				$character = $value;
    151 				$length    = 1;
    152 				$remaining = 0;
    153 			}
    154 			// Two byte sequence:
    155 			elseif (($value & 0xE0) === 0xC0) {
    156 				$character = ($value & 0x1F) << 6;
    157 				$length    = 2;
    158 				$remaining = 1;
    159 			}
    160 			// Three byte sequence:
    161 			elseif (($value & 0xF0) === 0xE0) {
    162 				$character = ($value & 0x0F) << 12;
    163 				$length    = 3;
    164 				$remaining = 2;
    165 			}
    166 			// Four byte sequence:
    167 			elseif (($value & 0xF8) === 0xF0) {
    168 				$character = ($value & 0x07) << 18;
    169 				$length    = 4;
    170 				$remaining = 3;
    171 			}
    172 			// Invalid byte:
    173 			else {
    174 				throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
    175 			}
    176 
    177 			if ($remaining > 0) {
    178 				if ($position + $length > $strlen) {
    179 					throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
    180 				}
    181 				for ($position++; $remaining > 0; $position++) {
    182 					$value = ord($input[$position]);
    183 
    184 					// If it is invalid, count the sequence as invalid and reprocess the current byte:
    185 					if (($value & 0xC0) !== 0x80) {
    186 						throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
    187 					}
    188 
    189 					--$remaining;
    190 					$character |= ($value & 0x3F) << ($remaining * 6);
    191 				}
    192 				$position--;
    193 			}
    194 
    195 			if (// Non-shortest form sequences are invalid
    196 				$length > 1 && $character <= 0x7F
    197 				|| $length > 2 && $character <= 0x7FF
    198 				|| $length > 3 && $character <= 0xFFFF
    199 				// Outside of range of ucschar codepoints
    200 				// Noncharacters
    201 				|| ($character & 0xFFFE) === 0xFFFE
    202 				|| $character >= 0xFDD0 && $character <= 0xFDEF
    203 				|| (
    204 					// Everything else not in ucschar
    205 					$character > 0xD7FF && $character < 0xF900
    206 					|| $character < 0x20
    207 					|| $character > 0x7E && $character < 0xA0
    208 					|| $character > 0xEFFFD
    209 				)
    210 			) {
    211 				throw new Requests_Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
    212 			}
    213 
    214 			$codepoints[] = $character;
    215 		}
    216 
    217 		return $codepoints;
    218 	}
    219 
    220 	/**
    221 	 * RFC3492-compliant encoder
    222 	 *
    223 	 * @internal Pseudo-code from Section 6.3 is commented with "#" next to relevant code
    224 	 * @throws Requests_Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
    225 	 *
    226 	 * @param string $input UTF-8 encoded string to encode
    227 	 * @return string Punycode-encoded string
    228 	 */
    229 	public static function punycode_encode($input) {
    230 		$output = '';
    231 		// let n = initial_n
    232 		$n = self::BOOTSTRAP_INITIAL_N;
    233 		// let delta = 0
    234 		$delta = 0;
    235 		// let bias = initial_bias
    236 		$bias = self::BOOTSTRAP_INITIAL_BIAS;
    237 		// let h = b = the number of basic code points in the input
    238 		$h = 0;
    239 		$b = 0; // see loop
    240 		// copy them to the output in order
    241 		$codepoints = self::utf8_to_codepoints($input);
    242 		$extended   = array();
    243 
    244 		foreach ($codepoints as $char) {
    245 			if ($char < 128) {
    246 				// Character is valid ASCII
    247 				// TODO: this should also check if it's valid for a URL
    248 				$output .= chr($char);
    249 				$h++;
    250 			}
    251 			// Check if the character is non-ASCII, but below initial n
    252 			// This never occurs for Punycode, so ignore in coverage
    253 			// @codeCoverageIgnoreStart
    254 			elseif ($char < $n) {
    255 				throw new Requests_Exception('Invalid character', 'idna.character_outside_domain', $char);
    256 			}
    257 			// @codeCoverageIgnoreEnd
    258 			else {
    259 				$extended[$char] = true;
    260 			}
    261 		}
    262 		$extended = array_keys($extended);
    263 		sort($extended);
    264 		$b = $h;
    265 		// [copy them] followed by a delimiter if b > 0
    266 		if (strlen($output) > 0) {
    267 			$output .= '-';
    268 		}
    269 		// {if the input contains a non-basic code point < n then fail}
    270 		// while h < length(input) do begin
    271 		$codepointcount = count($codepoints);
    272 		while ($h < $codepointcount) {
    273 			// let m = the minimum code point >= n in the input
    274 			$m = array_shift($extended);
    275 			//printf('next code point to insert is %s' . PHP_EOL, dechex($m));
    276 			// let delta = delta + (m - n) * (h + 1), fail on overflow
    277 			$delta += ($m - $n) * ($h + 1);
    278 			// let n = m
    279 			$n = $m;
    280 			// for each code point c in the input (in order) do begin
    281 			for ($num = 0; $num < $codepointcount; $num++) {
    282 				$c = $codepoints[$num];
    283 				// if c < n then increment delta, fail on overflow
    284 				if ($c < $n) {
    285 					$delta++;
    286 				}
    287 				// if c == n then begin
    288 				elseif ($c === $n) {
    289 					// let q = delta
    290 					$q = $delta;
    291 					// for k = base to infinity in steps of base do begin
    292 					for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
    293 						// let t = tmin if k <= bias {+ tmin}, or
    294 						//     tmax if k >= bias + tmax, or k - bias otherwise
    295 						if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
    296 							$t = self::BOOTSTRAP_TMIN;
    297 						}
    298 						elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
    299 							$t = self::BOOTSTRAP_TMAX;
    300 						}
    301 						else {
    302 							$t = $k - $bias;
    303 						}
    304 						// if q < t then break
    305 						if ($q < $t) {
    306 							break;
    307 						}
    308 						// output the code point for digit t + ((q - t) mod (base - t))
    309 						$digit   = $t + (($q - $t) % (self::BOOTSTRAP_BASE - $t));
    310 						$output .= self::digit_to_char($digit);
    311 						// let q = (q - t) div (base - t)
    312 						$q = floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
    313 					} // end
    314 					// output the code point for digit q
    315 					$output .= self::digit_to_char($q);
    316 					// let bias = adapt(delta, h + 1, test h equals b?)
    317 					$bias = self::adapt($delta, $h + 1, $h === $b);
    318 					// let delta = 0
    319 					$delta = 0;
    320 					// increment h
    321 					$h++;
    322 				} // end
    323 			} // end
    324 			// increment delta and n
    325 			$delta++;
    326 			$n++;
    327 		} // end
    328 
    329 		return $output;
    330 	}
    331 
    332 	/**
    333 	 * Convert a digit to its respective character
    334 	 *
    335 	 * @see https://tools.ietf.org/html/rfc3492#section-5
    336 	 * @throws Requests_Exception On invalid digit (`idna.invalid_digit`)
    337 	 *
    338 	 * @param int $digit Digit in the range 0-35
    339 	 * @return string Single character corresponding to digit
    340 	 */
    341 	protected static function digit_to_char($digit) {
    342 		// @codeCoverageIgnoreStart
    343 		// As far as I know, this never happens, but still good to be sure.
    344 		if ($digit < 0 || $digit > 35) {
    345 			throw new Requests_Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
    346 		}
    347 		// @codeCoverageIgnoreEnd
    348 		$digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
    349 		return substr($digits, $digit, 1);
    350 	}
    351 
    352 	/**
    353 	 * Adapt the bias
    354 	 *
    355 	 * @see https://tools.ietf.org/html/rfc3492#section-6.1
    356 	 * @param int $delta
    357 	 * @param int $numpoints
    358 	 * @param bool $firsttime
    359 	 * @return int New bias
    360 	 *
    361 	 * function adapt(delta,numpoints,firsttime):
    362 	 */
    363 	protected static function adapt($delta, $numpoints, $firsttime) {
    364 		// if firsttime then let delta = delta div damp
    365 		if ($firsttime) {
    366 			$delta = floor($delta / self::BOOTSTRAP_DAMP);
    367 		}
    368 		// else let delta = delta div 2
    369 		else {
    370 			$delta = floor($delta / 2);
    371 		}
    372 		// let delta = delta + (delta div numpoints)
    373 		$delta += floor($delta / $numpoints);
    374 		// let k = 0
    375 		$k = 0;
    376 		// while delta > ((base - tmin) * tmax) div 2 do begin
    377 		$max = floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN) * self::BOOTSTRAP_TMAX) / 2);
    378 		while ($delta > $max) {
    379 			// let delta = delta div (base - tmin)
    380 			$delta = floor($delta / (self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN));
    381 			// let k = k + base
    382 			$k += self::BOOTSTRAP_BASE;
    383 		} // end
    384 		// return k + (((base - tmin + 1) * delta) div (delta + skew))
    385 		return $k + floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW));
    386 	}
    387 }