parse link references as blocks to improve performance

rearrange block types to optimize performance
escaping for special characters
2023-08-10 21:13:06 +03:00 · 2013-11-05 00:57:16 +02:00 · 2013-11-04 09:28:50 +02:00 · 2013-11-03 17:32:45 +02:00 · 2013-11-02 21:42:55 +02:00 · 2013-11-02 02:18:13 +02:00
40 changed files with 768 additions and 536 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
 .DS_Store
+.idea
 nbproject
--- a/.travis.yml
+++ b/.travis.yml
@ -4,4 +4,3 @@ php:
  - 5.5
  - 5.4
  - 5.3
-  - 5.2
--- a/Parsedown.php
+++ b/Parsedown.php
@ -50,7 +50,8 @@ class Parsedown
 		$text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);
 		
 		# Removes \r characters.
-		$text = str_replace("\r", '', $text);
+		$text = str_replace("\r\n", "\n", $text);
+		$text = str_replace("\r", "\n", $text);
 		
 		# Replaces tabs with spaces.
 		$text = str_replace("\t", '    ', $text);
@ -74,21 +75,14 @@ class Parsedown
 			}
 		}
 		
-		# Extracts link references.
-		
-		if (preg_match_all('/^[ ]{0,3}\[(.+)\][ ]?:[ ]*\n?[ ]*(.+)$/m', $text, $matches, PREG_SET_ORDER))
-		{
-			foreach ($matches as $matches)
-			{
-				$this->reference_map[$matches[1]] = $matches[2];
-				
-				$text = str_replace($matches[0], '', $text);
-			}
-		}
-		
 		# ~ 
 		
-		$text = $this->parse_blocks($text);
+		$text = preg_replace('/\n\s*\n/', "\n\n", $text);
+		$text = trim($text, "\n");
+		
+		$lines = explode("\n", $text);
+		
+		$text = $this->parse_block_elements($lines);
 		
 		# Decodes escape sequences (leaves out backslashes).
 		
@ -106,379 +100,416 @@ class Parsedown
 	# Private Methods 
 	# 
 	
-	private function parse_blocks($text)
+	private function parse_block_elements(array $lines, $context = '')
 	{
-		# Divides text into blocks.
-		$blocks = preg_split('/\n\s*\n/', $text, -1, PREG_SPLIT_NO_EMPTY);
+		$elements = array();
 		
-		# Makes sure compound blocks get rendered.
-		$blocks []= NULL;
+		$element = array(
+			'type' => '',
+		);
 		
-		$markup = '';
-		
-		# Parses blocks.
-		
-		foreach ($blocks as $block)
+		foreach ($lines as $line)
 		{
-			if (isset($block) and $block[0] >= 'A')
-			{
-				$quick_block = $block;
+			# Block-Level HTML 
 			
-				unset($block);
+			if ($element['type'] === 'block' and ! isset($element['closed']))
+			{
+				if (preg_match('{<'.$element['subtype'].'>$}', $line)) # <open>
+				{
+					$element['depth']++;
 				}
 				
-			# List 
-			
-			if (isset($block) and preg_match('/^([ ]{0,3})(\d+[.]|[*+-])[ ]/', $block, $matches)) # list item
+				if (preg_match('{</'.$element['subtype'].'>$}', $line)) # </close>
 				{
-				if (isset($list)) # subsequent 
-				{
-					$list .= "\n\n".$block;
-				}
-				else # first 
-				{
-					$list = $block;
-					$list_indentation = strlen($matches[1]);
-					
-					list($list_type, $list_marker_pattern) = ($matches[2] === '-' or $matches[2] === '+' or $matches[2] === '*')
-						? array('ul', '[*+-]')
-						: array('ol', '\d+[.]');
+					$element['depth'] > 0 
+						? $element['depth']-- 
+						: $element['closed'] = true;
 				}
 				
-				unset($block);
+				$element['text'] .= "\n".$line;
+
+				continue;
 			}
-			elseif (isset($block) and isset($list) and $block[0] === ' ') # list item block 
-			{
-				$list .= "\n\n".$block;
 			
-				unset($block);
+			# Empty 
+			
+			if ($line === '')
+			{
+				$element['interrupted'] = true;
+				
+				continue;
 			}
-			elseif (isset($list))
-			{
-				$markup .= '<'.$list_type.'>'."\n";
 			
-				# Of the same type and indentation.
-				$list_items = preg_split('/^([ ]{'.$list_indentation.'})'.$list_marker_pattern.'[ ]/m', $list, -1, PREG_SPLIT_NO_EMPTY);
+			# Lazy Blockquote 
 			
-				foreach ($list_items as $list_item)
+			if ($element['type'] === 'blockquote' and ! isset($element['interrupted']))
 			{
-					$markup .= '<li>';
+				$line = preg_replace('/^[ ]*>[ ]?/', '', $line);
 				
-					if (strpos($list_item, "\n\n")) # sparse 
-					{
-						$list_item = trim($list_item, "\n");
+				$element['lines'] []= $line;
 				
-						if (strpos($list_item, "\n\n"))
+				continue;
+			}
+			
+			# Lazy List Item 
+			
+			if ($element['type'] === 'li')
 			{
-							$list_item = preg_replace('/^[ ]{0,4}/m', '', $list_item);
-							$list_item = $this->parse_blocks($list_item);
+				if (preg_match('/^([ ]{0,3})(\d+[.]|[*+-])[ ](.*)/', $line, $matches))
+				{
+					if ($element['indentation'] !== $matches[1]) 
+					{
+						$element['lines'] []= $line;
 					}
 					else 
 					{
-							$list_item = $this->parse_lines($list_item);
+						unset($element['last']);
+						
+						$elements []= $element;
+						
+						$element = array(
+							'type' => 'li',
+							'indentation' => $matches[1],
+							'last' => true,
+							'lines' => array(
+								preg_replace('/^[ ]{0,4}/', '', $matches[3]),
+							),
+						);
 					}
 					
-						$markup .= "\n".$list_item;
+					continue;
 				}
-					else # dense 
+				
+				if (isset($element['interrupted']))
 				{
-						$list_item = trim($list_item, "\n");
-
-						$list_item = strpos($list_item, "\n")
-							? $this->parse_lines($list_item)
-							: $this->parse_inline_elements($list_item);
-						
-						$markup .= $list_item;
-					}
-					
-					$markup .= '</li>'."\n";
-				}
-				
-				$markup .= '</'.$list_type.'>'."\n";
-				
-				unset($list);
-			}
-			
-			# Code Block 
-			
-			if (isset($block) and strlen($block) > 4 and $block[0] === ' ' and $block[1] === ' ' and $block[2] === ' ' and $block[3] === ' ')
+					if ($line[0] === ' ')
 					{
-				if (isset($code_block))
-				{
-					$code_block .= "\n\n".$block;
+						$element['lines'] []= '';
+						
+						$line = preg_replace('/^[ ]{0,4}/', '', $line);;
+						
+						$element['lines'] []= $line;
+						
+						continue;
+					}
 				}
 				else
 				{
-					$code_block = $block;
+					$line = preg_replace('/^[ ]{0,4}/', '', $line);;
+						
+					$element['lines'] []= $line;
+					
+					continue;
+				}
 			}
 			
-				unset($block);
-			}
-			elseif (isset($code_block))
+			# Quick Paragraph 
+			
+			if ($line[0] >= 'A' and $line[0] !== '_' and $line[0] !== '[')
 			{
-				$code_block_text = preg_replace('/^[ ]{4}/m', '', $code_block);
-				$code_block_text = htmlentities($code_block_text, ENT_NOQUOTES);
-				
-				# Decodes encoded escape sequences if present.
-				strpos($code_block_text, "\x1A\\") !== FALSE and $code_block_text = strtr($code_block_text, $this->escape_sequence_map);
-				
-				$markup .= '<pre><code>'.$code_block_text.'</code></pre>'."\n";
-				
-				unset($code_block);
+				goto paragraph; # trust me 
 			}
 			
-			# Atx Heading 
+			# Code 
 			
-			if (isset($block) and $block[0] === '#' and preg_match('/^(#{1,6})[ ]*(.+?)[ ]*#*$/', $block, $matches))
+			if ($line[0] === ' ' and preg_match('/^[ ]{4}(.*)/', $line, $matches))
 			{
+				if ($element['type'] === 'code')
+				{
+					isset($element['interrupted']) and $element['text'] .= "\n";
+					
+					$element['text'] .= "\n".$matches[1];
+				}
+				else
+				{
+					$elements []= $element;
+					
+					$element = array(
+						'type' => 'code',
+						'text' => $matches[1],
+					);
+				}
+
+				continue;
+			}
+			
+			# Setext Header (---)  
+			
+			if ($line[0] === '-' and $element['type'] === 'p' and ! isset($element['interrupted']) and preg_match('/^[-]+[ ]*$/', $line))
+			{
+				$element['type'] = 'h.';
+				$element['level'] = 2;
+				
+				continue;
+			}
+			
+			# Atx Header (#)
+			
+			if ($line[0] === '#' and preg_match('/^(#{1,6})[ ]*(.+?)[ ]*#*$/', $line, $matches))
+			{
+				$elements []= $element;
+				
 				$level = strlen($matches[1]);
 				
-				$heading = $this->parse_inline_elements($matches[2]);
-				
-				$markup .= '<h'.$level.'>'.$heading.'</h'.$level.'>'."\n";
+				$element = array(
+					'type' => 'h.',
+					'text' => $matches[2],
+					'level' => $level,
+				);
 				
 				continue;
 			}
 			
-			# Quote Block 
+			# Setext Header (===) 
 			
-			if (isset($block) and preg_match('/^[ ]{0,3}>/', $block))
+			if ($line[0] === '=' and $element['type'] === 'p' and ! isset($element['interrupted']) and preg_match('/^[=]+[ ]*$/', $line))
 			{
-				$block = preg_replace('/^[ ]{0,3}>[ ]?/m', '', $block);
-				$block = $this->parse_blocks($block);
-				
-				$markup .= '<blockquote>'."\n".$block.'</blockquote>'."\n";
-				
-				continue;
-			}
-			
-			# Horizontal Line 
-			
-			if (isset($block) and preg_match('/^[ ]{0,3}([-*_])([ ]{0,2}\1){2,}$/', $block))
-			{
-				$markup .= '<hr />'."\n";
+				$element['type'] = 'h.';
+				$element['level'] = 1;
 				
 				continue;
 			}
 			
 			# ~ 
 			
-			if (isset($quick_block))
-			{
-				$block = $quick_block;
+			$pure_line = ltrim($line);
 			
-				unset ($quick_block);
+			# Link Reference 
+			
+			if ($pure_line[0] === '[' and preg_match('/^\[(.+?)\]:[ ]*([^ ]+)/', $pure_line, $matches)) # [label]: url 
+			{
+				$label = strtolower($matches[1]); # lookups lowercase the name, so the key has to be stored in lowercase 
+				$url = trim($matches[2], '<>'); # accepts the <url> form 
+				
+				$this->reference_map[$label] = $url;
+				
+				continue;
 			}
 			
-			# 
-			# Paragraph 
+			# Blockquote 
 			
-			if (isset($block))
+			if ($pure_line[0] === '>' and preg_match('/^>[ ]?(.*)/', $pure_line, $matches))
 			{
-				if (strpos($block, "\n"))
+				if ($element['type'] === 'blockquote')
 				{
-					$markup .= $this->parse_lines($block);
+					if (isset($element['interrupted']))
+					{
+						$element['lines'] []= '';
+						
+						unset($element['interrupted']);
+					}
+					
+					$element['lines'] []= $matches[1];
 				}
 				else
 				{
-					$element_text = $this->parse_inline_elements($block);
-					$element = '<p>'.$element_text.'</p>'."\n";
+					$elements []= $element;
 					
-					$markup .= $element;
-				}
-			}
+					$element = array(
+						'type' => 'blockquote',
+						'lines' => array(
+							$matches[1],
+						),
+					);
 				}
 				
-		return $markup;
+				continue;
 			}
 			
-	private function parse_lines($text)
+			# HTML  
+			
+			if ($pure_line[0] === '<')
 			{
-		$text = trim($text, "\n");
+				# Block-Level HTML <self-closing/>

-		$lines = explode("\n", $text);
+				if (preg_match('{^<.+?/>$}', $pure_line))
+				{
+					$elements []= $element;

-		$lines []= NULL;
+					$element = array(
+						'type' => '',
+						'text' => $pure_line,
+					);
+
+					continue;
+				}
+				
+				# Block-Level HTML <open>
+
+				if (preg_match('{^<(\w+)(?:[ ].*?)?>}', $pure_line, $matches))
+				{
+					$elements []= $element;
+
+					$element = array(
+						'type' => 'block',
+						'subtype' => strtolower($matches[1]),
+						'text' => $pure_line,
+						'depth' => 0,
+					);
+					
+					preg_match('{</'.$matches[1].'>\s*$}', $pure_line) and $element['closed'] = true;
+
+					continue;
+				}
+			}
+			
+			# Horizontal Rule   
+			
+			if (preg_match('/^([-*_])([ ]{0,2}\1){2,}[ ]*$/', $pure_line))
+			{
+				$elements []= $element;
+				
+				$element = array(
+					'type' => 'hr',
+				);
+				
+				continue;
+			}
+			
+			# List Item 
+			
+			if (preg_match('/^([ ]*)(\d+[.]|[*+-])[ ](.*)/', $line, $matches))
+			{
+				$elements []= $element;
+				
+				$element = array(
+					'type' => 'li',
+					'ordered' => isset($matches[2][1]),
+					'indentation' => $matches[1],
+					'last' => true,
+					'lines' => array(
+						preg_replace('/^[ ]{0,4}/', '', $matches[3]),
+					),
+				);
+				
+				continue;
+			}
+			
+			# ~ 
+			
+			paragraph:
+			
+			if ($element['type'] === 'p')
+			{
+				if (isset($element['interrupted']))
+				{
+					$elements []= $element;
+					
+					$element['text'] = $line;
+					
+					unset($element['interrupted']);
+				}
+				else
+				{
+					$element['text'] .= "\n".$line;
+				}
+			}
+			else
+			{
+				$elements []= $element;
+				
+				$element = array(
+					'type' => 'p', 
+					'text' => $line,
+				);
+			}
+		}
+		
+		$elements []= $element;
+		
+		array_shift($elements);
+		
+		# 
+		# ~ 
+		# 
 		
 		$markup = '';
 		
-		foreach ($lines as $line)
+		foreach ($elements as $index => $element)
 		{
-			if (isset($line) and $line === '')
+			switch ($element['type'])
 			{
-				unset($line);
-			}
+				case 'li':
 					
-			# Paragraph 
-			
-			if (isset($line) and $line[0] >= 'A')
+					if (isset($element['ordered'])) # first
 					{
-				$quick_line = $line;
-				
-				unset($line);
-			}
-			
-			# List 
-			
-			if (isset($line) and preg_match('/^([ ]*)(\d+[.]|[*+-])[ ](.*)/', $line, $matches)) # list item 
-			{
-				$list_item_indentation = strlen($matches[1]);
-				$list_item_type = ($matches[2] === '-' or $matches[2] === '+' or $matches[2] === '*')
-					? 'ul'
-					: 'ol';
-				
-				if (isset($list)) # subsequent 
-				{
-					if ($list_item_indentation === $list_indentation and $list_item_type === $list_type)
-					{
-						# Adds last list item to the list.
-						$list []= $list_item;
-						
-						# Creates a separate list item.
-						$list_item = $matches[3];
-					}
-					else 
-					{
-						# Adds line to the current list item.
-						$list_item .= "\n".$line;
-					}
-				}
-				else # first 
-				{
-					$list = array();
-					$list_indentation = $list_item_indentation;
-					$list_type = $list_item_type;
-					
-					$list_item = $matches[3];
-				}
-				
-				unset($line);
-			}
-			else 
-			{
-				if (isset($list))
-				{
-					$list []= $list_item;
+						$list_type = $element['ordered'] ? 'ol' : 'ul';
 						
 						$markup .= '<'.$list_type.'>'."\n";
-					
-					foreach ($list as $list_item)
-					{
-						$list_item_text = strpos($list_item, "\n") 
-							? $this->parse_lines($list_item)
-							: $this->parse_inline_elements($list_item);
-						
-						$markup .= '<li>'.$list_item_text.'</li>'."\n";
 					}
 					
-					$markup .= '</'.$list_type.'>'."\n";
-					
-					unset($list);
-				}
+					if (isset($element['interrupted']) and ! isset($element['last']))
+					{
+						$element['lines'] []= '';
 					}
 					
-			# Quote Block 
+					$text = $this->parse_block_elements($element['lines'], 'li');
 					
-			if (isset($line) and preg_match('/^[ ]*>[ ]?(.*)/', $line, $matches))
+					$markup .= '<li>'.$text.'</li>'."\n";
+					
+					isset($element['last']) and $markup .= '</'.$list_type.'>'."\n";
+					
+					break;
+				
+				case 'p':
+					
+					$text = $this->parse_inline_elements($element['text']);
+					
+					$text = preg_replace('/[ ]{2}\n/', '<br />'."\n", $text);
+					
+					if ($context === 'li' and $index === 0)
 					{
-				if (isset($quote))
+						if (isset($element['interrupted']))
 						{
-					$quote .= "\n".$matches[1];
+							$markup .= "\n".'<p>'.$text.'</p>'."\n";
 						}
 						else 
 						{
-					$quote = $matches[1];
-				}
-				
-				unset($line);
-			}
-			else 
-			{
-				if (isset($quote))
-				{
-					$quote = $this->parse_blocks($quote);
-					
-					$markup .= '<blockquote>'."\n".$quote.'</blockquote>'."\n";
-					
-					unset($quote);
-				}
-			}
-			
-			# Atx Heading 
-			
-			if (isset($atx_heading))
-			{
-				$markup .= '<h'.$atx_heading_level.'>'.$atx_heading.'</h'.$atx_heading_level.'>'."\n";
-				
-				unset($atx_heading);
-			}
-			
-			if (isset($line) and $line[0] === '#' and preg_match('/^(#{1,6})[ ]*(.+?)[ ]*#*$/', $line, $matches))
-			{
-				$atx_heading_level = strlen($matches[1]);
-				
-				$atx_heading = $this->parse_inline_elements($matches[2]);
-				
-				unset($line);
-			}
-			
-			# Setext Heading 
-			
-			if (isset($line) and isset($paragraph))
-			{
-				$setext_characters = array('=', '-');
-				
-				foreach ($setext_characters as $index => $setext_character)
-				{
-					if ($line[0] === $setext_character and preg_match('/^['.$setext_character.']+[ ]*$/', $line))
-					{
-						$setext_heading_level = $index + 1;
-						
-						$setext_heading_text = $this->parse_inline_elements($paragraph);
-						
-						$markup .= '<h'.$setext_heading_level.'>'.$setext_heading_text.'</h'.$setext_heading_level.'>'."\n";
-						
-						unset($paragraph, $line);
-						
-						continue 2;
-					}
-				}
-			}
-			
-			# Paragraph 
-			
-			if (isset($quick_line))
-			{
-				$line = $quick_line;
-				
-				unset($quick_line);
-			}
-			
-			if (isset($line))
-			{
-				substr($line, -2) === '  '
-					and $line = substr($line, 0, -2)
-					and $line .= '<br/>';
-				
-				if (isset($paragraph))
-				{
-					$paragraph .= "\n".$line;
-				}
-				else 
-				{
-					$paragraph = $line;
+							$markup .= $text;
 						}
 					}
 					else 
 					{
-				if (isset($paragraph))
-				{
-					$element_text = $this->parse_inline_elements($paragraph);
-					
-					$markup .= '<p>'.$element_text.'</p>'."\n";
-					
-					unset($paragraph);
+						$markup .= '<p>'.$text.'</p>'."\n";
 					}
+					
+					break;
+				
+				case 'code':
+					
+					$text = htmlentities($element['text'], ENT_NOQUOTES);
+					
+					strpos($text, "\x1A\\") !== FALSE and $text = strtr($text, $this->escape_sequence_map);
+					
+					$markup .= '<pre><code>'.$text.'</code></pre>'."\n";
+					
+					break;
+				
+				case 'blockquote':
+					
+					$text = $this->parse_block_elements($element['lines']);
+					
+					$markup .= '<blockquote>'."\n".$text.'</blockquote>'."\n";
+					
+					break;
+				
+				case 'h.':
+					
+					$text = $this->parse_inline_elements($element['text']);
+					
+					$markup .= '<h'.$element['level'].'>'.$text.'</h'.$element['level'].'>'."\n";
+					
+					break;
+				
+				case 'hr':
+					
+					$markup .= '<hr />'."\n";
+					
+					break;
+
+				default:
+					
+					$markup .= $element['text']."\n";
 			}
 		}
 		
@ -522,15 +553,53 @@ class Parsedown
 			}
 		}
 		
-		# Reference(d) Link / Image 
+		# Inline Link / Image 
 		
-		if ($this->reference_map and strpos($text, '[') !== FALSE and preg_match_all('/(!?)\[(.+?)\][ ]?\[(.+?)\]/', $text, $matches, PREG_SET_ORDER))
+		if (strpos($text, '](') !== FALSE and preg_match_all('/(!?)(\[((?:[^][]+|(?2))*)\])\((.*?)\)/', $text, $matches, PREG_SET_ORDER)) # inline 
 		{
 			foreach ($matches as $matches)
 			{
-				if (array_key_exists($matches[3], $this->reference_map))
+				$url = $this->escape_special_characters($matches[4]);
+				
+				if ($matches[1]) # image 
 				{
-					$url = $this->reference_map[$matches[3]];
+					$element = '<img alt="'.$matches[3].'" src="'.$url.'">';
+				}
+				else
+				{
+					$element_text = $this->parse_inline_elements($matches[3]);
+					
+					$element = '<a href="'.$url.'">'.$element_text.'</a>';
+				}
+				
+				# ~ 
+
+				$code = "\x1A".'$'.$index;
+
+				$text = str_replace($matches[0], $code, $text);
+
+				$map[$code] = $element;
+				
+				$index ++;
+			}
+		}
+		
+		# Reference(d) Link / Image 
+		
+		if ($this->reference_map and strpos($text, '[') !== FALSE and preg_match_all('/(!?)\[(.+?)\](?:\n?[ ]?\[(.*?)\])?/ms', $text, $matches, PREG_SET_ORDER))
+		{
+			foreach ($matches as $matches)
+			{
+				$link_definition = isset($matches[3]) && $matches[3]
+					? $matches[3]
+					: $matches[2]; # implicit 
+				
+				$link_definition = strtolower($link_definition);
+				
+				if (isset($this->reference_map[$link_definition]))
+				{
+					$url = $this->reference_map[$link_definition];
+					$url = $this->escape_special_characters($url);
 					
 					if ($matches[1]) # image 
 					{
@ -556,44 +625,17 @@ class Parsedown
 			}
 		}
 		
-		# Inline Link / Image 
-		
-		if (strpos($text, '](') !== FALSE and preg_match_all('/(!?)(\[((?:[^][]+|(?2))*)\])\((.*?)\)/', $text, $matches, PREG_SET_ORDER)) # inline 
-		{
-			foreach ($matches as $matches)
-			{
-				if ($matches[1]) # image 
-				{
-					$element = '<img alt="'.$matches[3].'" src="'.$matches[4].'">';
-				}
-				else 
-				{
-					$element_text = $this->parse_inline_elements($matches[3]);
-					
-					$element = '<a href="'.$matches[4].'">'.$element_text.'</a>';
-				}
-				
-				$element_text = $this->parse_inline_elements($matches[1]);
-				
-				# ~ 
-
-				$code = "\x1A".'$'.$index;
-
-				$text = str_replace($matches[0], $code, $text);
-
-				$map[$code] = $element;
-				
-				$index ++;
-			}
-		}
+		# Automatic Links 
 		
 		if (strpos($text, '<') !== FALSE and preg_match_all('/<((https?|ftp|dict):[^\^\s]+?)>/i', $text, $matches, PREG_SET_ORDER))
 		{
 			foreach ($matches as $matches)
 			{
+				$url = $this->escape_special_characters($matches[1]);
+				
 				$element = '<a href=":href">:text</a>';
-				$element = str_replace(':text', $matches[1], $element);
-				$element = str_replace(':href', $matches[1], $element);
+				$element = str_replace(':text', $url, $element);
+				$element = str_replace(':href', $url, $element);
 				
 				# ~ 
 				
@ -607,15 +649,35 @@ class Parsedown
 			}
 		}
 		
-		if (strpos($text, '*') !== FALSE or strpos($text, '_') !== FALSE)
+		# ~ 
+		
+		$text = $this->escape_special_characters($text);
+
+		# ~ 
+		
+		if (strpos($text, '_') !== FALSE)
 		{
-			$text = preg_replace('/(\*\*|__)(.+?[*_]*)(?<=\S)\1/', '<strong>$2</strong>', $text);
-			$text = preg_replace('/(\*|_)(.+?)(?<=\S)\1/', '<em>$2</em>', $text);
+			$text = preg_replace('/__(?=\S)(.+?)(?<=\S)__/', '<strong>$1</strong>', $text);
+			$text = preg_replace('/_(?=\S)(.+?)(?<=\S)_/', '<em>$1</em>', $text);
+		}
+		
+		if (strpos($text, '*') !== FALSE)
+		{
+			$text = preg_replace('/\*\*(?=\S)(.+?)(?<=\S)\*\*/', '<strong>$1</strong>', $text);
+			$text = preg_replace('/\*(?=\S)(.+?)(?<=\S)\*/', '<em>$1</em>', $text);
 		}
 		
 		$text = strtr($text, $map);
 		
 		return $text;
 	}
-}
 	
+	private function escape_special_characters($text) # returns $text with bare "&" and every "<" replaced by HTML entities 
+	{
+		strpos($text, '&') !== FALSE and $text = preg_replace('/&(?!#?\w+;)/', '&amp;', $text); # an "&" that already starts an entity (&amp; or &#38;) is kept as is 
+		
+		$text = str_replace('<', '&lt;', $text); # ">" and quotes are not touched 
+		
+		return $text;
+	}
+}
--- a/tests/Test.php
+++ b/tests/Test.php
@ -34,6 +34,8 @@ class Test extends PHPUnit_Framework_TestCase
 					continue;
 				
 				$expected_markup = file_get_contents(__DIR__ . '/' . self::provider_dir . $basename . '.html');
+				$expected_markup = str_replace("\r\n", "\n", $expected_markup);
+				$expected_markup = str_replace("\r", "\n", $expected_markup);
 				
 				$provider [] = array($markdown, $expected_markup);
 			}
--- a/tests/data/atx_heading.html
+++ b/tests/data/atx_heading.html
@ -4,11 +4,3 @@
 <h4>This is an h4</h4>
 <h5>This is an h5</h5>
 <h6>This is an h6</h6>
-<h1>This is a closed h1</h1>
-<h2>This is a closed h2</h2>
-<h3>This is a closed h3</h3>
-<h4>This is a closed h4</h4>
-<h5>This is a closed h5</h5>
-<h6>This is a closed h6</h6>
-<h1>This is an irregularly closed h1</h1>
-<h4>This is an irregularly closed h4</h4>
--- a/tests/data/atx_heading.md
+++ b/tests/data/atx_heading.md
@ -9,19 +9,3 @@
 ##### This is an h5

 ###### This is an h6
-
-# This is a closed h1 #
-
-## This is a closed h2 ##
-
-### This is a closed h3 ###
-
-#### This is a closed h4 ####
-
-##### This is a closed h5 #####
-
-###### This is a closed h6 ######
-
-# This is an irregularly closed h1 ###
-
-#### This is an irregularly closed h4 ##
--- a/tests/data/blockquote.html
+++ b/tests/data/blockquote.html
@ -1,25 +1,13 @@
 <p>Here's a regular blockquote:</p>
 <blockquote>
-<p>This is a blockquote.</p>
+<p>blockquote</p>
 </blockquote>
 <p>Here's one with no space after the ">":</p>
 <blockquote>
-<p>This is a blockquote.</p>
+<p>blockquote</p>
 </blockquote>
-<p>Here's one with multiple paragraphs:</p>
+<p>Here's one on multiple lines:</p>
 <blockquote>
-<p>This is line one.</p>
-<p>This is line two.</p>
-</blockquote>
-<p>Here's one with multiple types of blocks:</p>
-<blockquote>
-<p>This is a quoted paragraph.</p>
-<ul>
-<li>This is a list item of a quoted list.</li>
-<li>This is another list item.</li>
-</ul>
-<blockquote>
-<p>This is a nested quote block.</p>
-</blockquote>
-<p>This is another paragraph.</p>
+<p>line 1
+line 2</p>
 </blockquote>
--- a/tests/data/blockquote.md
+++ b/tests/data/blockquote.md
@ -1,24 +1,12 @@
 Here's a regular blockquote:

-> This is a blockquote.
+> blockquote

 Here's one with no space after the ">":

->This is a blockquote.
+>blockquote

-Here's one with multiple paragraphs:
+Here's one on multiple lines:

-> This is line one.
->
-> This is line two.
-
-Here's one with multiple types of blocks:
-
-> This is a quoted paragraph.
-> 
-> - This is a list item of a quoted list.
-> - This is another list item.
->
-> > This is a nested quote block.
-> 
-> This is another paragraph.
+> line 1
+> line 2
--- a/tests/data/closed_atx_heading.html
+++ b/tests/data/closed_atx_heading.html
@ -0,0 +1,6 @@
+<h1>h1</h1>
+<h2>h2</h2>
+<h3>h3</h3>
+<h4>h4</h4>
+<h5>h5</h5>
+<h6>h6</h6>
--- a/tests/data/closed_atx_heading.md
+++ b/tests/data/closed_atx_heading.md
@ -0,0 +1,11 @@
+# h1 #
+
+## h2 ##
+
+### h3 ###
+
+#### h4 ####
+
+##### h5 #####
+
+###### h6 ######
--- a/tests/data/compound_blockquote.html
+++ b/tests/data/compound_blockquote.html
@ -0,0 +1,16 @@
+<p>Here's one with multiple paragraphs:</p>
+<blockquote>
+<p>This is line one.</p>
+<p>This is line two.</p>
+</blockquote>
+<p>Here's one with multiple types of blocks:</p>
+<blockquote>
+<p>This is a quoted paragraph.</p>
+<ul>
+<li>This is a list item of a quoted list.</li>
+<li>This is another list item.</li>
+</ul>
+<blockquote>
+<p>This is a nested quote block.</p>
+</blockquote>
+</blockquote>
--- a/tests/data/compound_blockquote.md
+++ b/tests/data/compound_blockquote.md
@ -0,0 +1,14 @@
+Here's one with multiple paragraphs:
+
+> This is line one.
+>
+> This is line two.
+
+Here's one with multiple types of blocks:
+
+> This is a quoted paragraph.
+> 
+> - This is a list item of a quoted list.
+> - This is another list item.
+>
+> > This is a nested quote block.
--- a/tests/data/cyrillic.html
+++ b/tests/data/cyrillic.html
--- a/tests/data/cyrillic.md
+++ b/tests/data/cyrillic.md
--- a/tests/data/regular_list.html
+++ b/tests/data/regular_list.html
@ -15,3 +15,10 @@
 <p>Here's one with no space after markers:</p>
 <p>-list item
 -another list item</p>
+<p>Here's one where items contain line breaks:</p>
+<ul>
+<li>list
+item</li>
+<li>another
+list item</li>
+</ul>
--- a/tests/data/regular_list.md
+++ b/tests/data/regular_list.md
@ -18,3 +18,10 @@ Here's one with no space after markers:

 -list item
 -another list item
+
+Here's one where items contain line breaks:
+
+- list
+item
+- another
+list item
--- a/tests/data/email.html
+++ b/tests/data/email.html
--- a/tests/data/email.md
+++ b/tests/data/email.md
--- a/tests/data/emphasis.html
+++ b/tests/data/emphasis.html
@ -3,6 +3,5 @@
 <p>Here's <strong>a strong one</strong>. </p>
 <p>Here's <em>an emphasis that uses underscores</em>. </p>
 <p>Here's <strong>a strong emphasis that uses underscores</strong>.</p>
-<p>This is _ not an emphasis _ neither is * that * .</p>
+<p>This is not _ an emphasis _ neither is * this * neither is _ this_ neither is _this _.</p>
 <p>Empty emphasis ** is not __ an emphasis.</p>
-<p>Three asterisks are an emphasized asterisk <em>*</em> .</p>
--- a/tests/data/emphasis.md
+++ b/tests/data/emphasis.md
@ -8,8 +8,6 @@ Here's _an emphasis that uses underscores_.

 Here's __a strong emphasis that uses underscores__.

-This is _ not an emphasis _ neither is * that * .
+This is not _ an emphasis _ neither is * this * neither is _ this_ neither is _this _.

 Empty emphasis ** is not __ an emphasis.
-
-Three asterisks are an emphasized asterisk *** .
--- a/tests/data/horizontal_rule.html
+++ b/tests/data/horizontal_rule.html
@ -0,0 +1,16 @@
+<p>Dashes:</p>
+<hr />
+<hr />
+<hr />
+<hr />
+<pre><code>---</code></pre>
+<hr />
+<hr />
+<hr />
+<hr />
+<pre><code>- - -</code></pre>
+<p>Asterisks:</p>
+<hr />
+<p>Underscores:</p>
+<hr />
+<p>Based on <a href="http://daringfireball.net/projects/downloads/MarkdownTest_1.0.zip">the original</a> test suite.</p>
--- a/tests/data/horizontal_rule.md
+++ b/tests/data/horizontal_rule.md
@ -0,0 +1,31 @@
+Dashes:
+
+---
+
+ ---
+ 
+  ---
+
+   ---
+
+	---
+
+- - -
+
+ - - -
+ 
+  - - -
+
+   - - -
+
+	- - -
+
+Asterisks:
+
+***
+
+Underscores:
+
+___
+
+Based on [the original](http://daringfireball.net/projects/downloads/MarkdownTest_1.0.zip) test suite.
--- a/tests/data/html.html
+++ b/tests/data/html.html
@ -0,0 +1,15 @@
+<p>Self-closing tag:</p>
+<hr/>
+<p>Self-closing tag with attributes:</p>
+<hr style="background: #eaa" />
+<p>Bare element:</p>
+<div>content</div>
+<p>Element with attributes:</p>
+<a href="http://parsedown.org">link</a>
+<p>Nested elements:</p>
+<div>
+parent
+<div>
+child
+</div>
+</div>
--- a/tests/data/html.md
+++ b/tests/data/html.md
@ -0,0 +1,24 @@
+Self-closing tag:
+
+<hr/>
+
+Self-closing tag with attributes:
+
+<hr style="background: #eaa" />
+
+Bare element:
+
+<div>content</div>
+
+Element with attributes:
+
+<a href="http://parsedown.org">link</a>
+
+Nested elements:
+
+<div>
+parent
+<div>
+child
+</div>
+</div>
--- a/tests/data/lazy_blockquote.html
+++ b/tests/data/lazy_blockquote.html
@ -0,0 +1,4 @@
+<blockquote>
+<p>line 1
+line 2</p>
+</blockquote>
--- a/tests/data/lazy_blockquote.md
+++ b/tests/data/lazy_blockquote.md
@ -0,0 +1,2 @@
+> line 1
+line 2
--- a/tests/data/lazy_list_item.html
+++ b/tests/data/lazy_list_item.html
@ -0,0 +1,4 @@
+<ul>
+<li>li
+more text</li>
+</ul>
--- a/tests/data/lazy_list_item.md
+++ b/tests/data/lazy_list_item.md
@ -0,0 +1,2 @@
+- li
+more text
--- a/tests/data/line_break.html
+++ b/tests/data/line_break.html
@ -0,0 +1,2 @@
+<p>line<br />
+line</p>
--- a/tests/data/line_break.md
+++ b/tests/data/line_break.md
@ -0,0 +1,2 @@
+line  
+line
--- a/tests/data/paragraph_blockquote.html
+++ b/tests/data/paragraph_blockquote.html
@ -0,0 +1,4 @@
+<p>Here's a paragraph.</p>
+<blockquote>
+<p>a block quote that belongs to it.</p>
+</blockquote>
--- a/tests/data/paragraph_blockquote.md
+++ b/tests/data/paragraph_blockquote.md
@ -0,0 +1,2 @@
+Here's a paragraph.
+> a block quote that belongs to it.
--- a/tests/data/reference_link.html
+++ b/tests/data/reference_link.html
@ -1,8 +1,11 @@
 <p>Here's a <a href="http://parsedown.org">reference link</a>.</p>
-<p>Here's <a href="http://parsedown.org">one</a> with an alternative syntax.</p>
 <p>Here's <a href="http://parsedown.org">one</a> on the next line.</p>
-<p>Here's <a href="http://parsedown.org">one</a> on 2 lines.</p>
 <p>Here's <a href="http://parsedown.org/tests/">one</a> with a different URL.</p>
 <p>Here's <a href="http://parsedown.org">one</a> with a semantic name.</p>
+<p>Here's <a href="http://parsedown.org">one</a> with definition name on the next line.</p>
 <p>Here's [one][404] with no definition.</p>
-<p>Here's an image: <img alt="Markdown Logo" src="https://raw.github.com/dcurtis/markdown-mark/master/png/32x20-solid.png"></p>
+<p>Here's an image: <img alt="Markdown Logo" src="/md.png"></p>
+<p>Here's an <a href="http://google.com">implicit one</a>.</p>
+<p>Here's an <a href="http://google.com">implicit one</a> with an empty link definition.</p>
+<p>Here's a <a href="http://parsedown.org">multiline
+one</a> defined on 2 lines.</p>
--- a/tests/data/reference_link.md
+++ b/tests/data/reference_link.md
@ -2,28 +2,31 @@ Here's a [reference link][1].

 [1]: http://parsedown.org

-Here's [one] [2] with an alternative syntax.
-
+Here's [one][2] on the next line.
 [2]: http://parsedown.org

-Here's [one][3] on the next line.
-[3]: http://parsedown.org
+Here's [one][3] with a different URL.

-Here's [one][4] on 2 lines.
+[3]: http://parsedown.org/tests/

-[4]:
-http://parsedown.org
+Here's [one][website] with a semantic name.

-Here's [one][5] with a different URL.
+[website]: http://parsedown.org

-[5]: http://parsedown.org/tests/
-
-Here's [one][the website] with a semantic name.
-
-[the website]: http://parsedown.org
+Here's [one]
+[website] with definition name on the next line.

 Here's [one][404] with no definition.

 Here's an image: ![Markdown Logo][image]

-[image]: https://raw.github.com/dcurtis/markdown-mark/master/png/32x20-solid.png
+[image]: /md.png
+
+Here's an [implicit one].
+
+[implicit one]: http://google.com
+
+Here's an [implicit one][] with an empty link definition.
+
+Here's a [multiline
+one][website] defined on 2 lines.
--- a/tests/data/setext_header.html
+++ b/tests/data/setext_header.html
@ -0,0 +1,5 @@
+<h1>h1</h1>
+<h2>h2</h2>
+<h2>single character</h2>
+<p>not a header</p>
+<hr />
--- a/tests/data/setext_header.md
+++ b/tests/data/setext_header.md
@ -0,0 +1,12 @@
+h1
+==
+
+h2
+--
+
+single character
+-
+
+not a header
+
+------------
--- a/tests/data/sparse_list.html
+++ b/tests/data/sparse_list.html
@ -1,14 +1,16 @@
-<p>Here's a list where items are separated by empty lines:</p>
+<p>Here's a sparse list:</p>
 <ul>
 <li>
 <p>list item</p>
 </li>
 <li>another list item</li>
 </ul>
-<p>Here's an ordered one:</p>
-<ol>
+<p>Here's one with an indented list item:</p>
+<ul>
 <li>
-<p>item one</p>
+<p>li</p>
+<ul>
+<li>li</li>
+</ul>
 </li>
-<li>item two</li>
-</ol>
+</ul>
--- a/tests/data/sparse_list.md
+++ b/tests/data/sparse_list.md
@ -1,11 +1,11 @@
-Here's a list where items are separated by empty lines:
+Here's a sparse list:

 - list item

 - another list item

-Here's an ordered one:
+Here's one with an indented list item:

-1. item one
+- li

-2. item two
+    - li
--- a/tests/data/special_characters.html
+++ b/tests/data/special_characters.html
@ -0,0 +1,8 @@
+<p>AT&amp;T has an ampersand in their name.</p>
+<p>AT&amp;T is another way to write it.</p>
+<p>This &amp; that.</p>
+<p>4 &lt; 5 and 6 > 5.</p>
+<p>Here's a <a href="http://example.com/?foo=1&amp;bar=2">link</a> with an ampersand in the URL.</p>
+<p>Here's an inline <a href="/script?foo=1&amp;bar=2">link</a>.</p>
+<hr />
+<p>Based on <a href="http://daringfireball.net/projects/downloads/MarkdownTest_1.0.zip">the original</a> test suite.</p>
--- a/tests/data/special_characters.md
+++ b/tests/data/special_characters.md
@ -0,0 +1,17 @@
+AT&T has an ampersand in their name.
+
+AT&amp;T is another way to write it.
+
+This & that.
+
+4 < 5 and 6 > 5.
+
+Here's a [link] [1] with an ampersand in the URL.
+
+Here's an inline [link](/script?foo=1&bar=2).
+
+[1]: http://example.com/?foo=1&bar=2
+
+---
+
+Based on [the original](http://daringfireball.net/projects/downloads/MarkdownTest_1.0.zip) test suite.
Author	SHA1	Message	Date
Emanuil Rusev	b12973415f	parse link references as blocks to improve performance	2013-11-05 00:57:16 +02:00
Emanuil Rusev	6d113f47fb	rearrange block types to optimize performance	2013-11-04 09:28:50 +02:00
Emanuil Rusev	d4d3612710	escaping for special characters	2013-11-03 17:32:45 +02:00
Emanuil Rusev	2e314ad474	resolve #24	2013-11-02 21:42:55 +02:00
Emanuil Rusev	e475602e2f	simplify parsing of code blocks	2013-11-02 02:18:13 +02:00
Emanuil Rusev	f43f54b877	remove redundant parse_inline_elements call	2013-10-23 00:50:32 +03:00
Emanuil Rusev	d733acc94e	add .idea to .gitignore	2013-10-23 00:44:21 +03:00
Emanuil Rusev	6a0695deb9	correct spelling of $link_definition	2013-10-13 22:52:36 +03:00
Emanuil	5dd40e7adf	add test for horizontal rule	2013-09-24 22:53:42 +03:00
Emanuil	b9808f23e0	setext underlines should not work on interrupted paragraphs	2013-09-24 22:36:24 +03:00
Emanuil	47b1789430	resolve #9	2013-09-24 02:32:58 +03:00
Emanuil	f8119fa3cb	separate compiling from parsing	2013-09-24 01:19:17 +03:00
Emanuil	d306ee3db5	improve tests	2013-09-24 01:09:13 +03:00
Emanuil	e15241cb92	remove incomplete tests	2013-09-24 01:00:20 +03:00
Emanuil	7ab71ade06	optimize parsing of rule	2013-09-20 02:12:06 +03:00
Emanuil	64f82e1e2a	inline links should get parsed before reference links	2013-09-20 01:12:40 +03:00
Emanuil	f40dbdfb65	variable names should express what they represent rather than why they represent it	2013-09-19 23:54:28 +03:00
Emanuil	033c2b78c1	match blockquote comment	2013-09-19 23:28:12 +03:00
Emanuil	34035316df	NULL » null	2013-09-19 23:12:48 +03:00
Emanuil	f13214cfa7	single line blockquotes should also go through "parse_lines"	2013-09-18 19:53:44 +03:00
Emanuil	238b1029c0	remove "parse_blocks" method in favor of a more capable "parse_lines"	2013-09-18 00:27:35 +03:00
Emanuil	bc27850c41	improve emphasis test	2013-09-03 00:15:25 +03:00
Emanuil	3afeee3b19	parse * and _ emphasis types separately to optimize performance and improve readability	2013-09-03 00:14:04 +03:00
Emanuil	a94a45f955	reference_link test should reference md.png with a relative path	2013-09-02 22:12:43 +03:00
Emanuil	4af89c5087	reference links should be able to have their names on the next line	2013-08-31 22:27:38 +03:00
Emanuil	0352f01c7e	leading \n characters should not be parsed as part of first block	2013-08-31 21:44:23 +03:00
Emanuil	40c2dcfac7	resolve #20	2013-08-31 20:28:23 +03:00
Emanuil	097ec5e8a5	test case should deal with \r characters	2013-08-31 20:11:48 +03:00
Emanuil	8ac52a2f30	resolve #17	2013-08-31 19:55:07 +03:00
Emanuil	4a6bb88239	improve the code that removes \r characters	2013-08-31 19:54:14 +03:00
Emanuil	609ad47c38	resolve #16	2013-07-26 00:08:52 +03:00
Emanuil	7d7e89f5c3	remove 5.2 from PHP versions to test against	2013-07-25 01:49:02 +03:00