package org.aliabdelaziz.arabic.util; import java.net.URLEncoder; import java.io.UnsupportedEncodingException; import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.HashMap; import java.util.Map; import java.util.regex.Pattern; import java.util.regex.Matcher; /** * Convenience methods for escaping special characters related to HTML, XML, * and regular expressions. * *

To keep you safe by default, WEB4J goes to some effort to escape * characters in your data when appropriate, such that you usually * don't need to think too much about escaping special characters. Thus, you * shouldn't need to directly use the services of this class very often. * *

For Model Objects containing free form user input, * it is highly recommended that you use {@link SafeText}, not String. * Free form user input is open to malicious use, such as * Cross Site Scripting * attacks. * Using SafeText will protect you from such attacks, by always escaping * special characters automatically in its toString() method. * *

The following WEB4J classes will automatically escape special characters * for you, when needed : *

* * @updatedBy * @author Ali Abdel-Aziz * @website http://www.aliabdelaziz.org/ */ public final class EscapeChars { /** used for matching in the passed text */ private static final Pattern SCRIPT_END = Pattern.compile( "", Pattern.CASE_INSENSITIVE ); /** ESCAPe charatcers & arabic letters HTML mapping */ public static Map CHARACTER_REPLACEMEN = new HashMap(); static { CHARACTER_REPLACEMEN.put(new Character('<'), "<"); CHARACTER_REPLACEMEN.put(new Character('>'), ">"); CHARACTER_REPLACEMEN.put(new Character('&'), "&"); CHARACTER_REPLACEMEN.put(new Character('\"'), """); CHARACTER_REPLACEMEN.put(new Character('\t'), " "); CHARACTER_REPLACEMEN.put(new Character('!'), "!"); CHARACTER_REPLACEMEN.put(new Character('#'), "#"); CHARACTER_REPLACEMEN.put(new Character('$'), "$"); CHARACTER_REPLACEMEN.put(new Character('%'), "%"); CHARACTER_REPLACEMEN.put(new Character('\''), "'"); CHARACTER_REPLACEMEN.put(new Character('('), "("); CHARACTER_REPLACEMEN.put(new Character(')'), ")"); CHARACTER_REPLACEMEN.put(new Character('*'), "*"); CHARACTER_REPLACEMEN.put(new Character('+'), "+"); CHARACTER_REPLACEMEN.put(new Character(','), ","); CHARACTER_REPLACEMEN.put(new Character('-'), "-"); CHARACTER_REPLACEMEN.put(new Character('.'), "."); // CHARACTER_REPLACEMEN.put(new Character('/'), "/"); CHARACTER_REPLACEMEN.put(new Character(':'), ":"); CHARACTER_REPLACEMEN.put(new Character(';'), ";"); CHARACTER_REPLACEMEN.put(new Character('='), "="); CHARACTER_REPLACEMEN.put(new Character('?'), "?"); CHARACTER_REPLACEMEN.put(new Character('@'), "@"); CHARACTER_REPLACEMEN.put(new Character('['), "["); CHARACTER_REPLACEMEN.put(new Character('\\'),"\"); CHARACTER_REPLACEMEN.put(new Character(']'), "]"); CHARACTER_REPLACEMEN.put(new Character('^'), "^"); CHARACTER_REPLACEMEN.put(new Character('_'), "_"); CHARACTER_REPLACEMEN.put(new Character('`'), "`"); CHARACTER_REPLACEMEN.put(new Character('{'), "{"); CHARACTER_REPLACEMEN.put(new Character('|'), "|"); CHARACTER_REPLACEMEN.put(new 
Character('}'), "}"); CHARACTER_REPLACEMEN.put(new Character('~'), "~"); // Arabic Letteres CHARACTER_REPLACEMEN.put(new Character('Ã'), "أ"); CHARACTER_REPLACEMEN.put(new Character('Ç'), "ا"); CHARACTER_REPLACEMEN.put(new Character('È'), "ب"); CHARACTER_REPLACEMEN.put(new Character('Ê'), "ت"); CHARACTER_REPLACEMEN.put(new Character('Ë'), "ث"); CHARACTER_REPLACEMEN.put(new Character('Ì'), "ج"); CHARACTER_REPLACEMEN.put(new Character('Í'), "ح"); CHARACTER_REPLACEMEN.put(new Character('Î'), "خ"); CHARACTER_REPLACEMEN.put(new Character('Ï'), "د"); CHARACTER_REPLACEMEN.put(new Character('Ð'), "ذ"); CHARACTER_REPLACEMEN.put(new Character('Ñ'), "ر"); CHARACTER_REPLACEMEN.put(new Character('Ò'), "ز"); CHARACTER_REPLACEMEN.put(new Character('Ó'), "س"); CHARACTER_REPLACEMEN.put(new Character('Ô'), "ش"); CHARACTER_REPLACEMEN.put(new Character('Õ'), "ص"); CHARACTER_REPLACEMEN.put(new Character('Ö'), "ض"); CHARACTER_REPLACEMEN.put(new Character('Ø'), "ط"); CHARACTER_REPLACEMEN.put(new Character('Ù'), "ظ"); CHARACTER_REPLACEMEN.put(new Character('Ú'), "ع"); CHARACTER_REPLACEMEN.put(new Character('Û'), "غ"); CHARACTER_REPLACEMEN.put(new Character('Ý'), "ف"); CHARACTER_REPLACEMEN.put(new Character('Þ'), "ق"); CHARACTER_REPLACEMEN.put(new Character('ß'), "ك"); CHARACTER_REPLACEMEN.put(new Character('á'), "ل"); CHARACTER_REPLACEMEN.put(new Character('ã'), "م"); CHARACTER_REPLACEMEN.put(new Character('ä'), "ن"); CHARACTER_REPLACEMEN.put(new Character('å'), "ه"); CHARACTER_REPLACEMEN.put(new Character('æ'), "و"); CHARACTER_REPLACEMEN.put(new Character('ì'), "ى"); CHARACTER_REPLACEMEN.put(new Character('í'), "ي"); } /** * Constructors */ /** PRIVATE EscapeChars Constructor */ private EscapeChars(){ //empty - prevent construction } /** * Escape characters for text appearing in HTML markup. *

This method exists as a defence against Cross Site Scripting (XSS) hacks. * The idea is to neutralize control characters commonly used by scripts, such that * they will not be executed by the browser. This is done by replacing the control * characters with their escaped equivalents. * See {@link hirondelle.web4j.security.SafeText} as well. * *

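 * <P>The following characters are replaced with corresponding
 * HTML character entities :
 * <table border='1' cellpadding='3' cellspacing='0'>
 * <tr><th> Character </th><th> Replacement </th></tr>
 * <tr><td> &lt; </td><td> &amp;lt; </td></tr>
 * <tr><td> &gt; </td><td> &amp;gt; </td></tr>
 * <tr><td> &amp; </td><td> &amp;amp; </td></tr>
 * <tr><td> " </td><td> &amp;quot; </td></tr>
 * <tr><td> \t </td><td> &amp;#009; </td></tr>
 * <tr><td> ! </td><td> &amp;#033; </td></tr>
 * <tr><td> # </td><td> &amp;#035; </td></tr>
 * <tr><td> $ </td><td> &amp;#036; </td></tr>
 * <tr><td> % </td><td> &amp;#037; </td></tr>
 * <tr><td> ' </td><td> &amp;#039; </td></tr>
 * <tr><td> ( </td><td> &amp;#040; </td></tr>
 * <tr><td> ) </td><td> &amp;#041; </td></tr>
 * <tr><td> * </td><td> &amp;#042; </td></tr>
 * <tr><td> + </td><td> &amp;#043; </td></tr>
 * <tr><td> , </td><td> &amp;#044; </td></tr>
 * <tr><td> - </td><td> &amp;#045; </td></tr>
 * <tr><td> . </td><td> &amp;#046; </td></tr>
 * <tr><td> / </td><td> &amp;#047; (this mapping is commented out in the code) </td></tr>
 * <tr><td> : </td><td> &amp;#058; </td></tr>
 * <tr><td> ; </td><td> &amp;#059; </td></tr>
 * <tr><td> = </td><td> &amp;#061; </td></tr>
 * <tr><td> ? </td><td> &amp;#063; </td></tr>
 * <tr><td> @ </td><td> &amp;#064; </td></tr>
 * <tr><td> [ </td><td> &amp;#091; </td></tr>
 * <tr><td> \ </td><td> &amp;#092; </td></tr>
 * <tr><td> ] </td><td> &amp;#093; </td></tr>
 * <tr><td> ^ </td><td> &amp;#094; </td></tr>
 * <tr><td> _ </td><td> &amp;#095; </td></tr>
 * <tr><td> ` </td><td> &amp;#096; </td></tr>
 * <tr><td> { </td><td> &amp;#123; </td></tr>
 * <tr><td> | </td><td> &amp;#124; </td></tr>
 * <tr><td> } </td><td> &amp;#125; </td></tr>
 * <tr><td> ~ </td><td> &amp;#126; </td></tr>
 * </table>
 *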

Note that JSTL's {@code } escapes only the first * five of the above characters. */ public static String forHTML(String aText) { final StringBuffer result = new StringBuffer(); final StringCharacterIterator iterator = new StringCharacterIterator(aText); char character = iterator.current(); while (character != CharacterIterator.DONE ){ String replacement = (String) CHARACTER_REPLACEMEN.get(new Character(character)); if(replacement != null) { result.append(replacement); } else { //the char is not a special one //add it to the result as is result.append(character); } character = iterator.next(); } return result.toString(); } /** * Escape all ampersand characters in a URL. *

   * <P>Replaces all '&amp;' characters with '&amp;amp;'.
   *
   *

An ampersand character may appear in the query string of a URL. * The ampersand character is indeed valid in a URL. * However, URLs usually appear as an HREF attribute, and * such attributes have the additional constraint that ampersands * must be escaped. * *

The JSTL tag does indeed perform proper URL encoding of * query parameters. But it does not, in general, produce text which * is valid as an HREF attribute, simply because it does * not escape the ampersand character. This is a nuisance when * multiple query parameters appear in the URL, since it requires a little * extra work. */ public static String forHrefAmpersand(String aURL){ return aURL.replaceAll("&", "&"); } /** * Synonym for URLEncoder.encode(String, "UTF-8"). * *

Used to ensure that HTTP query strings are in proper form, by escaping * special characters such as spaces. * *

It is important to note that if a query string appears in an HREF * attribute, then there are two issues - ensuring the query string is valid HTTP * (it is URL-encoded), and ensuring it is valid HTML (ensuring the * ampersand is escaped). */ public static String forURL(String aURLFragment){ String result = null; try { result = URLEncoder.encode(aURLFragment, "UTF-8"); } catch (UnsupportedEncodingException ex){ throw new RuntimeException("UTF-8 not supported", ex); } return result; } /** * Escape characters for text appearing as XML data, between tags. * *

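   * <P>The following characters are replaced with corresponding character entities :
   * <table border='1' cellpadding='3' cellspacing='0'>
   * <tr><th> Character </th><th> Encoding </th></tr>
   * <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   * <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   * <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   * <tr><td> " </td><td> &amp;quot; </td></tr>
   * <tr><td> ' </td><td> &amp;#039; </td></tr>
   * </table>
   *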

Note that JSTL's {@code } escapes the exact same set of * characters as this method. That is, {@code } * is good for escaping to produce valid XML, but not for producing safe * HTML. */ public static String forXML(String aText) { final StringBuffer result = new StringBuffer(); final StringCharacterIterator iterator = new StringCharacterIterator(aText); char character = iterator.current(); while (character != CharacterIterator.DONE ){ String replacement = (String) CHARACTER_REPLACEMEN.get(new Character(character)); if (character == '<' || character == '>' || character == '\"' || character == '\'' || character == '&') { result.append(replacement); } else { //the char is not a special one //add it to the result as is result.append(character); } character = iterator.next(); } return result.toString(); } /** Escapes characters for text appearing as data in the Javascript Object Notation (JSON) data interchange format.

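   <P>The following commonly used control characters are escaped :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th> Escaped As </th></tr>
   <tr><td> " </td><td> \" </td></tr>
   <tr><td> \ </td><td> \\ </td></tr>
   <tr><td> / </td><td> \/ </td></tr>
   <tr><td> back space </td><td> \b </td></tr>
   <tr><td> form feed </td><td> \f </td></tr>
   <tr><td> line feed </td><td> \n </td></tr>
   <tr><td> carriage return </td><td> \r </td></tr>
   <tr><td> tab </td><td> \t </td></tr>
   </table>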

See RFC 4627 for more information. */ public static String forJSON(String aText){ final StringBuffer result = new StringBuffer(); StringCharacterIterator iterator = new StringCharacterIterator(aText); char character = iterator.current(); while (character != StringCharacterIterator.DONE){ if( character == '\"' ){ result.append("\\\""); } else if(character == '\\'){ result.append("\\\\"); } else if(character == '/'){ result.append("\\/"); } else if(character == '\b'){ result.append("\\b"); } else if(character == '\f'){ result.append("\\f"); } else if(character == '\n'){ result.append("\\n"); } else if(character == '\r'){ result.append("\\r"); } else if(character == '\t'){ result.append("\\t"); } else { //the char is not a special one //add it to the result as is result.append(character); } character = iterator.next(); } return result.toString(); } /** Return aText with all '<' and '>' characters replaced by their escaped equivalents. */ public static String toDisableTags(String aText){ final StringBuffer result = new StringBuffer(); final StringCharacterIterator iterator = new StringCharacterIterator(aText); char character = iterator.current(); while (character != CharacterIterator.DONE ){ if (character == '<') { result.append("<"); } else if (character == '>') { result.append(">"); } else { //the char is not a special one //add it to the result as is result.append(character); } character = iterator.next(); } return result.toString(); } /** Replace characters having special meaning in regular expressions with their escaped equivalents, preceded by a '\' character.

The escaped characters include :

*/ public static String forRegex(String aRegexFragment){ final StringBuffer result = new StringBuffer(); final StringCharacterIterator iterator = new StringCharacterIterator(aRegexFragment) ; char character = iterator.current(); while (character != CharacterIterator.DONE ){ /* All literals need to have backslashes doubled. */ if (character == '.') { result.append("\\."); } else if (character == '\\') { result.append("\\\\"); } else if (character == '?') { result.append("\\?"); } else if (character == '*') { result.append("\\*"); } else if (character == '+') { result.append("\\+"); } else if (character == '&') { result.append("\\&"); } else if (character == ':') { result.append("\\:"); } else if (character == '{') { result.append("\\{"); } else if (character == '}') { result.append("\\}"); } else if (character == '[') { result.append("\\["); } else if (character == ']') { result.append("\\]"); } else if (character == '(') { result.append("\\("); } else if (character == ')') { result.append("\\)"); } else if (character == '^') { result.append("\\^"); } else if (character == '$') { result.append("\\$"); } else { //the char is not a special one //add it to the result as is result.append(character); } character = iterator.next(); } return result.toString(); } /** Escape '$' and '\' characters in replacement strings.

Synonym for Matcher.quoteReplacement(String).

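   <P>The following methods use replacement strings which treat '$' and '\' as special characters:
   <ul>
   <li>{@link String#replaceAll(String, String)}
   <li>{@link String#replaceFirst(String, String)}
   <li>{@link Matcher#appendReplacement(StringBuffer, String)}
   </ul>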

If replacement text can contain arbitrary characters, then you will usually need to escape that text, to ensure special characters are interpreted literally. */ public static String forReplacementString(String aInput){ return quoteReplacement(aInput); } /** * Returns a replacement string for the given one that has all backslashes * and dollar signs escaped. * * @param s the input string. * @return the input string, with all backslashes and dollar signs having * been escaped. */ public static String quoteReplacement(String s) { // first check whether we have smth to quote if (s.indexOf('\\') < 0 && s.indexOf('$') < 0) return s; StringBuffer res = new StringBuffer(s.length() * 2); char ch; int len = s.length(); for (int i = 0; i < len; i++) { switch (ch = s.charAt(i)) { case '$': res.append('\\'); res.append('$'); break; case '\\': res.append('\\'); res.append('\\'); break; default: res.append(ch); } } return res.toString(); } /** * Disable all