patch 9.0.1771: regex: combining chars in collections not handled
Problem:  regex: combining chars in collections not handled
Solution: Check for following combining characters for NFA and BT engine

closes: #10459
closes: #10286

Signed-off-by: Christian Brabandt <cb@256bit.org>
This commit is contained in:
		| @ -3743,13 +3743,38 @@ regmatch( | ||||
|  | ||||
| 	  case ANYOF: | ||||
| 	  case ANYBUT: | ||||
| 	    if (c == NUL) | ||||
| 		status = RA_NOMATCH; | ||||
| 	    else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF)) | ||||
| 		status = RA_NOMATCH; | ||||
| 	    else | ||||
| 		ADVANCE_REGINPUT(); | ||||
| 	    break; | ||||
| 	    { | ||||
| 		char_u  *q = OPERAND(scan); | ||||
|  | ||||
| 		if (c == NUL) | ||||
| 		    status = RA_NOMATCH; | ||||
| 		else if ((cstrchr(q, c) == NULL) == (op == ANYOF)) | ||||
| 		    status = RA_NOMATCH; | ||||
| 		else | ||||
| 		{ | ||||
| 		    // Check following combining characters | ||||
| 		    int	len = 0; | ||||
| 		    int i; | ||||
|  | ||||
| 		    if (enc_utf8) | ||||
| 			len = utfc_ptr2len(q) - utf_ptr2len(q); | ||||
|  | ||||
| 		    MB_CPTR_ADV(rex.input); | ||||
| 		    MB_CPTR_ADV(q); | ||||
|  | ||||
| 		    if (!enc_utf8 || len == 0) | ||||
| 			break; | ||||
|  | ||||
| 		    for (i = 0; i < len; ++i) | ||||
| 			if (q[i] != rex.input[i]) | ||||
| 			{ | ||||
| 			    status = RA_NOMATCH; | ||||
| 			    break; | ||||
| 			} | ||||
| 		    rex.input += len; | ||||
| 		} | ||||
| 		break; | ||||
| 	    } | ||||
|  | ||||
| 	  case MULTIBYTECODE: | ||||
| 	    if (has_mbyte) | ||||
|  | ||||
							
								
								
									
										104
									
								
								src/regexp_nfa.c
									
									
									
									
									
								
							
							
						
						
									
										104
									
								
								src/regexp_nfa.c
									
									
									
									
									
								
							| @ -1764,6 +1764,7 @@ collection: | ||||
| 	    endp = skip_anyof(p); | ||||
| 	    if (*endp == ']') | ||||
| 	    { | ||||
| 		int plen; | ||||
| 		/* | ||||
| 		 * Try to reverse engineer character classes. For example, | ||||
| 		 * recognize that [0-9] stands for \d and [A-Za-z_] for \h, | ||||
| @ -2035,11 +2036,34 @@ collection: | ||||
| 			    if (got_coll_char == TRUE && startc == 0) | ||||
| 				EMIT(0x0a); | ||||
| 			    else | ||||
| 			    { | ||||
| 				EMIT(startc); | ||||
| 			    EMIT(NFA_CONCAT); | ||||
| 				if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))) | ||||
| 				{ | ||||
| 				    EMIT(NFA_CONCAT); | ||||
| 				} | ||||
| 			    } | ||||
| 			} | ||||
| 		    } | ||||
|  | ||||
| 		    if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))) | ||||
| 		    { | ||||
| 			int i = utf_ptr2len(regparse); | ||||
|  | ||||
| 			c = utf_ptr2char(regparse + i); | ||||
|  | ||||
| 			// Add composing characters | ||||
| 			for (;;) | ||||
| 			{ | ||||
| 			    EMIT(c); | ||||
| 			    EMIT(NFA_CONCAT); | ||||
| 			    if ((i += utf_char2len(c)) >= plen) | ||||
| 				break; | ||||
| 			    c = utf_ptr2char(regparse + i); | ||||
| 			} | ||||
| 			EMIT(NFA_COMPOSING); | ||||
| 			EMIT(NFA_CONCAT); | ||||
| 		    } | ||||
| 		    MB_PTR_ADV(regparse); | ||||
| 		} // while (p < endp) | ||||
|  | ||||
| @ -6418,6 +6442,84 @@ nfa_regmatch( | ||||
| 		result_if_matched = (t->state->c == NFA_START_COLL); | ||||
| 		for (;;) | ||||
| 		{ | ||||
| 		    if (state->c == NFA_COMPOSING) | ||||
| 		    { | ||||
| 			int	    mc = curc; | ||||
| 			int	    len = 0; | ||||
| 			nfa_state_T *end; | ||||
| 			nfa_state_T *sta; | ||||
| 			int	    cchars[MAX_MCO]; | ||||
| 			int	    ccount = 0; | ||||
| 			int	    j; | ||||
|  | ||||
| 			sta = t->state->out->out; | ||||
| 			len = 0; | ||||
| 			if (utf_iscomposing(sta->c)) | ||||
| 			{ | ||||
| 			    // Only match composing character(s), ignore base | ||||
| 			    // character.  Used for ".{composing}" and "{composing}" | ||||
| 			    // (no preceding character). | ||||
| 			    len += mb_char2len(mc); | ||||
| 			} | ||||
| 			if (rex.reg_icombine && len == 0) | ||||
| 			{ | ||||
| 			    // If \Z was present, then ignore composing characters. | ||||
| 			    // When ignoring the base character this always matches. | ||||
| 			    if (sta->c != curc) | ||||
| 				result = FAIL; | ||||
| 			    else | ||||
| 				result = OK; | ||||
| 			    while (sta->c != NFA_END_COMPOSING) | ||||
| 				sta = sta->out; | ||||
| 			} | ||||
| 			// Check base character matches first, unless ignored. | ||||
| 			else if (len > 0 || mc == sta->c) | ||||
| //			if (len > 0 || mc == sta->c) | ||||
| 			{ | ||||
| 			    if (len == 0) | ||||
| 			    { | ||||
| 				len += mb_char2len(mc); | ||||
| 				sta = sta->out; | ||||
| 			    } | ||||
|  | ||||
| 			    // We don't care about the order of composing characters. | ||||
| 			    // Get them into cchars[] first. | ||||
| 			    while (len < clen) | ||||
| 			    { | ||||
| 				mc = mb_ptr2char(rex.input + len); | ||||
| 				cchars[ccount++] = mc; | ||||
| 				len += mb_char2len(mc); | ||||
| 				if (ccount == MAX_MCO) | ||||
| 				    break; | ||||
| 			    } | ||||
|  | ||||
| 			    // Check that each composing char in the pattern matches a | ||||
| 			    // composing char in the text.  We do not check if all | ||||
| 			    // composing chars are matched. | ||||
| 			    result = OK; | ||||
| 			    while (sta->c != NFA_END_COMPOSING) | ||||
| 			    { | ||||
| 				for (j = 0; j < ccount; ++j) | ||||
| 				    if (cchars[j] == sta->c) | ||||
| 					break; | ||||
| 				if (j == ccount) | ||||
| 				{ | ||||
| 				    result = FAIL; | ||||
| 				    break; | ||||
| 				} | ||||
| 				sta = sta->out; | ||||
| 			    } | ||||
| 			} | ||||
| 			else | ||||
| 			    result = FAIL; | ||||
|  | ||||
| 			if (t->state->out->out1->c == NFA_END_COMPOSING) | ||||
| 			{ | ||||
| 			    end = t->state->out->out1; | ||||
| 			    ADD_STATE_IF_MATCH(end); | ||||
| 			} | ||||
| 			break; | ||||
| 		    } | ||||
| 		    if (state->c == NFA_END_COLL) | ||||
| 		    { | ||||
| 			result = !result_if_matched; | ||||
|  | ||||
| @ -575,5 +575,16 @@ func Test_match_too_complicated() | ||||
|   set regexpengine=0 | ||||
| endfunc | ||||
|  | ||||
| " Check that a collection holding a character with a combining character | ||||
| " only matches that exact sequence, for each 'regexpengine' setting. | ||||
| func Test_combining_chars_in_collection() | ||||
|   new | ||||
|   " Repeat the check with 're' set to 0, 1 and 2 (per Vim's documentation: | ||||
|   " automatic selection, backtracking engine, NFA engine). | ||||
|   for i in range(0,2) | ||||
|     exe "set re=".i | ||||
|     " The four test lines are put below the empty first line of the buffer. | ||||
|     put =['ɔ̃', 'ɔ',  '̃  ã', 'abcd'] | ||||
|     " Remove the first match of the collection on every line. | ||||
|     :%s/[ɔ̃]// | ||||
|     " Only the line holding the full sequence becomes empty; the line with | ||||
|     " the bare base character, the line starting with the bare combining | ||||
|     " character and the plain ASCII line must stay unchanged. | ||||
|     call assert_equal(['', '', 'ɔ', '̃  ã', 'abcd'], getline(1,'$')) | ||||
|     " Empty the buffer so the next iteration starts from a clean state. | ||||
|     %d | ||||
|   endfor | ||||
|   bw! | ||||
| endfunc | ||||
|  | ||||
| " vim: shiftwidth=2 sts=2 expandtab | ||||
|  | ||||
| @ -695,6 +695,8 @@ static char *(features[]) = | ||||
|  | ||||
| static int included_patches[] = | ||||
| {   /* Add new patch number below this line */ | ||||
| /**/ | ||||
|     1771, | ||||
| /**/ | ||||
|     1770, | ||||
| /**/ | ||||
|  | ||||
		Reference in New Issue
	
	Block a user