////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2017 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CommonUtils;

/**
 * <p>
 * Restrict using <a href =
 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
 * Unicode escapes</a> (such as <code>&#92;u221e</code>).
 * It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable(control) characters</a>.
 * Also, this check can be configured to allow using escapes
 * if a trailing comment is present. There is an option to
 * allow using escapes if the literal contains only them, and an option
 * to allow using escapes for non-printable (whitespace) characters.
 * </p>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
 * </pre>
 * <p>
 * An example of how to configure the check is:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 * </pre>
 * <p>
 * An example of non-printable(control) characters.
 * </p>
 * <pre>
 * return '&#92;ufeff' + content; // byte order mark
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable(control) characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes with a trailing comment:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * if a trailing comment is present:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowByTailComment" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if literal contains only them:
 * </p>
 * <pre>
 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
 * </pre>
 * <p>An example of how to configure the check to allow escapes
 * if literal contains only them:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>An example of how to configure the check to allow non-printable escapes:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 *
 * @author maxvetrenko
 *
 */
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {
    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /**
     * Regular expression for Unicode chars: a backslash, a single {@code u}
     * and exactly four hexadecimal digits.
     */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");

    /**
     * Regular expression Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
            + "(00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00(a|A)(d|D)|034(f|F)|070(f|F)"
            + "|180(e|E)|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");

    /**
     * Regular expression for a literal body that consists of escapes only.
     * Every repetition must consume one of: a Unicode escape, one of the
     * two-character escapes for backspace, tab, newline, form feed or
     * carriage return, a single backslash, a double quote or an apostrophe.
     * The last three alternatives together also cover the escaped backslash,
     * escaped quote and escaped apostrophe sequences.
     */
    private static final Pattern ALL_ESCAPED_CHARS =
            Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
                    + "|\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /**
     * Regular expression for non-printable unicode chars.
     * The alternation lists some code points more than once; the repetition
     * does not change what is matched.
     */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
            + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
            + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
            + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
            + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
            + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
            + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
            + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
            + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
            + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
            + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
            + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
            + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
            + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
            + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
            + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
            + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
            + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
            + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
            + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");

    /** Single-line (Cpp style) comments of the current file, looked up by line number. */
    private Map<Integer, TextBlock> singlelineComments;
    /** Block (C style) comments of the current file, looked up by line number. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable(control) characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if a trailing comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow escapes for non-printable (whitespace) characters. */
    private boolean allowNonPrintableEscapes;

    /**
     * Set allowEscapesForControlCharacters.
     * @param allow user's value.
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Set allowByTailComment.
     * @param allow user's value.
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Set allowIfAllCharactersEscaped.
     * @param allow user's value.
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Set allowNonPrintableEscapes.
     * @param allow user's value.
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getAcceptableTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
    }

    @Override
    public int[] getRequiredTokens() {
        return getAcceptableTokens();
    }

    @Override
    public void beginTree(DetailAST rootAST) {
        // Cache the comments once per file; hasTrailComment looks them up by line.
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    @Override
    public void visitToken(DetailAST ast) {
        final String literal = ast.getText();

        if (hasUnicodeChar(literal) && !isEscapeAllowed(ast, literal)) {
            log(ast.getLineNo(), MSG_KEY);
        }
    }

    /**
     * Checks whether any of the configured options permits the Unicode
     * escapes found in the literal. The options are evaluated in the order
     * trailing comment, all characters escaped, control characters,
     * non-printable characters; evaluation stops at the first one that applies.
     * @param ast the literal token.
     * @param literal text of the literal token, including its quotes.
     * @return true if the escapes in the literal must not be reported.
     */
    private boolean isEscapeAllowed(DetailAST ast, String literal) {
        return allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS);
    }

    /**
     * Checks if literal has Unicode chars.
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        return UNICODE_REGEXP.matcher(removeEscapedBackslashes(literal)).find();
    }

    /**
     * Removes every escaped backslash (two consecutive backslashes) from the
     * literal. What follows an escaped backslash is ordinary text, so
     * <code>&#92;&#92;u0041</code> must not be mistaken for a Unicode escape.
     * @param literal String literal.
     * @return the literal without escaped backslashes.
     */
    private static String removeEscapedBackslashes(String literal) {
        return ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
    }

    /**
     * Checks that every Unicode escape in the literal is matched by the given
     * pattern, by comparing the number of Unicode escapes with the number of
     * pattern matches. Escaped backslashes are removed first, exactly as in
     * {@link #hasUnicodeChar(String)}, so that text following an escaped
     * backslash is not counted as a Unicode escape.
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if the literal has as many matches of {@code pattern} as
     *     it has Unicode escapes.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        final String text = removeEscapedBackslashes(literal);
        final int unicodeMatchesCounter = countMatches(UNICODE_REGEXP, text);
        final int unicodeValidMatchesCounter = countMatches(pattern, text);
        return unicodeMatchesCounter == unicodeValidMatchesCounter;
    }

    /**
     * Check if a trailing comment is present after ast token.
     * Any single-line comment on the token's line counts. Otherwise the last
     * block comment registered for that line is examined.
     * @param ast current token.
     * @return true if a trailing comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        boolean result = false;
        final int lineNo = ast.getLineNo();
        if (singlelineComments.containsKey(lineNo)) {
            result = true;
        }
        else {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final String line = getLines()[lineNo - 1];
                result = isTrailingBlockComment(comment, line);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing: it either spans more than one
     * line, or nothing but whitespace follows it on its line.
     * @param comment the comment to check.
     * @param line the line where the comment starts.
     * @return true if the comment is trailing.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
        return comment.getText().length != 1
            || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
    }

    /**
     * Count regexp matches into String literal.
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal are escaped. The first and
     * the last character of the literal (its quotes) are not examined.
     * @param literal current literal.
     * @return true if the option is enabled and all characters in String
     *     literal are escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
                        literal.length() - 1)).find();
    }
}