001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2017 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.List;
023import java.util.Map;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
028import com.puppycrawl.tools.checkstyle.api.DetailAST;
029import com.puppycrawl.tools.checkstyle.api.TextBlock;
030import com.puppycrawl.tools.checkstyle.api.TokenTypes;
031import com.puppycrawl.tools.checkstyle.utils.CommonUtils;
032
033/**
034 * <p>
035 * Restrict using <a href =
036 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
037 * Unicode escapes</a> (such as <code>&#92;u221e</code>).
038 * It is possible to allow using escapes for
039 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
040 * non-printable(control) characters</a>.
041 * Also, this check can be configured to allow using escapes
042 * if trail comment is present. By the option it is possible to
043 * allow using escapes if literal contains only them. By the option it
044 * is possible to allow using escapes for space literals.
045 * </p>
046 * <p>
047 * Examples of using Unicode:</p>
048 * <pre>
049 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
050 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
051 * </pre>
052 * <p>
053 * An example of how to configure the check is:
054 * </p>
055 * <pre>
056 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
057 * </pre>
058 * <p>
059 * An example of non-printable(control) characters.
060 * </p>
061 * <pre>
062 * return '&#92;ufeff' + content; // byte order mark
063 * </pre>
064 * <p>
065 * An example of how to configure the check to allow using escapes
066 * for non-printable(control) characters:
067 * </p>
068 * <pre>
069 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
070 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
071 * &lt;/module&gt;
072 * </pre>
073 * <p>
074 * Example of using escapes with trail comment:
075 * </p>
076 * <pre>
077 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
078 * </pre>
079 * <p>An example of how to configure the check to allow using escapes
080 * if trail comment is present:
081 * </p>
082 * <pre>
083 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
084 *     &lt;property name="allowByTailComment" value="true"/&gt;
085 * &lt;/module&gt;
086 * </pre>
087 * <p>Example of using escapes if literal contains only them:
088 * </p>
089 * <pre>
090 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
091 * </pre>
092 * <p>An example of how to configure the check to allow escapes
093 * if literal contains only them:
094 * </p>
095 * <pre>
096 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
097 *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
098 * &lt;/module&gt;
099 * </pre>
100 * <p>An example of how to configure the check to allow non-printable escapes:
101 * </p>
102 * <pre>
103 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
104 *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
105 * &lt;/module&gt;
106 * </pre>
107 *
108 * @author maxvetrenko
109 *
110 */
111public class AvoidEscapedUnicodeCharactersCheck
112    extends AbstractCheck {
113    /**
114     * A key is pointing to the warning message text in "messages.properties"
115     * file.
116     */
117    public static final String MSG_KEY = "forbid.escaped.unicode.char";
118
119    /** Regular expression for Unicode chars. */
120    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
121
122    /**
123     * Regular expression Unicode control characters.
124     *
125     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
126     *     Appendix:Control characters</a>
127     */
128    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
129            + "(00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00(a|A)(d|D)|034(f|F)|070(f|F)"
130            + "|180(e|E)|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
131            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
132
133    /** Regular expression for all escaped chars. */
134    private static final Pattern ALL_ESCAPED_CHARS =
135            Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
136                    + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");
137
138    /** Regular expression for escaped backslash. */
139    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
140
141    /** Regular expression for non-printable unicode chars. */
142    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
143            + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
144            + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
145            + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
146            + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
147            + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
148            + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
149            + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
150            + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
151            + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
152            + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
153            + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
154            + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
155            + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
156            + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
157            + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
158            + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
159            + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
160            + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
161            + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");
162
163    /** Cpp style comments. */
164    private Map<Integer, TextBlock> singlelineComments;
165    /** C style comments. */
166    private Map<Integer, List<TextBlock>> blockComments;
167
168    /** Allow use escapes for non-printable(control) characters.  */
169    private boolean allowEscapesForControlCharacters;
170
171    /** Allow use escapes if trail comment is present. */
172    private boolean allowByTailComment;
173
174    /** Allow if all characters in literal are escaped. */
175    private boolean allowIfAllCharactersEscaped;
176
177    /** Allow escapes for space literals. */
178    private boolean allowNonPrintableEscapes;
179
180    /**
181     * Set allowIfAllCharactersEscaped.
182     * @param allow user's value.
183     */
184    public final void setAllowEscapesForControlCharacters(boolean allow) {
185        allowEscapesForControlCharacters = allow;
186    }
187
188    /**
189     * Set allowByTailComment.
190     * @param allow user's value.
191     */
192    public final void setAllowByTailComment(boolean allow) {
193        allowByTailComment = allow;
194    }
195
196    /**
197     * Set allowIfAllCharactersEscaped.
198     * @param allow user's value.
199     */
200    public final void setAllowIfAllCharactersEscaped(boolean allow) {
201        allowIfAllCharactersEscaped = allow;
202    }
203
204    /**
205     * Set allowSpaceEscapes.
206     * @param allow user's value.
207     */
208    public final void setAllowNonPrintableEscapes(boolean allow) {
209        allowNonPrintableEscapes = allow;
210    }
211
212    @Override
213    public int[] getDefaultTokens() {
214        return getAcceptableTokens();
215    }
216
217    @Override
218    public int[] getAcceptableTokens() {
219        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
220    }
221
222    @Override
223    public int[] getRequiredTokens() {
224        return getAcceptableTokens();
225    }
226
227    @Override
228    public void beginTree(DetailAST rootAST) {
229        singlelineComments = getFileContents().getSingleLineComments();
230        blockComments = getFileContents().getBlockComments();
231    }
232
233    @Override
234    public void visitToken(DetailAST ast) {
235
236        final String literal = ast.getText();
237
238        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
239                || isAllCharactersEscaped(literal)
240                || allowEscapesForControlCharacters
241                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
242                || allowNonPrintableEscapes
243                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
244            log(ast.getLineNo(), MSG_KEY);
245        }
246    }
247
248    /**
249     * Checks if literal has Unicode chars.
250     * @param literal String literal.
251     * @return true if literal has Unicode chars.
252     */
253    private static boolean hasUnicodeChar(String literal) {
254        final String literalWithoutEscapedBackslashes =
255                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
256        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
257    }
258
259    /**
260     * Check if String literal contains Unicode control chars.
261     * @param literal String literal.
262     * @param pattern RegExp for valid characters.
263     * @return true, if String literal contains Unicode control chars.
264     */
265    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
266        final int unicodeMatchesCounter =
267                countMatches(UNICODE_REGEXP, literal);
268        final int unicodeValidMatchesCounter =
269                countMatches(pattern, literal);
270        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
271    }
272
273    /**
274     * Check if trail comment is present after ast token.
275     * @param ast current token.
276     * @return true if trail comment is present after ast token.
277     */
278    private boolean hasTrailComment(DetailAST ast) {
279        boolean result = false;
280        final int lineNo = ast.getLineNo();
281        if (singlelineComments.containsKey(lineNo)) {
282            result = true;
283        }
284        else {
285            final List<TextBlock> commentList = blockComments.get(lineNo);
286            if (commentList != null) {
287                final TextBlock comment = commentList.get(commentList.size() - 1);
288                final String line = getLines()[lineNo - 1];
289                result = isTrailingBlockComment(comment, line);
290            }
291        }
292        return result;
293    }
294
295    /**
296     * Whether the C style comment is trailing.
297     * @param comment the comment to check.
298     * @param line the line where the comment starts.
299     * @return true if the comment is trailing.
300     */
301    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
302        return comment.getText().length != 1
303            || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
304    }
305
306    /**
307     * Count regexp matches into String literal.
308     * @param pattern pattern.
309     * @param target String literal.
310     * @return count of regexp matches.
311     */
312    private static int countMatches(Pattern pattern, String target) {
313        int matcherCounter = 0;
314        final Matcher matcher = pattern.matcher(target);
315        while (matcher.find()) {
316            matcherCounter++;
317        }
318        return matcherCounter;
319    }
320
321    /**
322     * Checks if all characters in String literal is escaped.
323     * @param literal current literal.
324     * @return true if all characters in String literal is escaped.
325     */
326    private boolean isAllCharactersEscaped(String literal) {
327        return allowIfAllCharactersEscaped
328                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
329                        literal.length() - 1)).find();
330    }
331}