Object
Validates UTF-8 encoding, primarily for use in Ruby environments other than 1.9.
Instances of this class are thread safe, and a single instance may be used safely by multiple concurrent threads, with one caveat:
The value of #{DEBUG} must not be changed by any thread.
For use during development only.
Validate the supplied string for proper UTF-8 encoding.
Calling Sequence:
validator = UTF8::Validator.new -> validator
validator.valid_encoding?(string) -> true or false
validator.valid_encoding?(string, true) -> true or exception
Parameters:
string | the string to validate |
raise_on_error | a flag to indicate failure behavior |
When raise_on_error is true and a string fails validation, an error of type #{UTF8::ValidationError} is raised. The byte in error and the location of that byte are described in the error message.
# File lib/validation/validator.rb, line 91
#
# Validate the supplied string for proper UTF-8 encoding by feeding its bytes,
# one at a time, through a state machine.
#
# string         - the string to validate; its #bytes are examined in order
# raise_on_error - when true, a failed validation raises instead of
#                  returning false
#
# Returns true when the whole string is accepted, false otherwise.  A string
# that ends in the middle of a multi-byte character is rejected.
#
# Raises ValidationError when validation fails and raise_on_error is true.
# The message carries the last byte examined and its zero-based offset in the
# string, each in decimal and in hex.  For a string truncated in the middle of
# a character, that is the final byte of the string.
#
# Raises RuntimeError if the machine is ever in an unknown state; none of the
# transitions below can produce one.
def valid_encoding?(string, raise_on_error = false)
  valid = true
  # Zero-based offset of the byte being examined.  It is incremented before
  # each byte is looked at, so it starts one short of the first offset.
  index = -1
  state = "start"
  next_byte_save = nil
  #
  string.bytes.each do |next_byte|
    index += 1
    next_byte_save = next_byte
    if DEBUG
      byte_hex = sprintf "%x", next_byte
      index_hex = sprintf "%x", index
      puts "Top: #{next_byte}(0x#{byte_hex}), index: #{index}(0x#{index_hex})"
    end
    case state

    # State: 'start'
    # The 'start' state:
    # * handles all occurrences of valid single byte characters i.e., the ASCII character set
    # * provides state transition logic for start bytes of valid characters with 2-4 bytes
    # * signals a validation failure for all other single bytes
    #
    when "start"
      puts "state: start" if DEBUG
      case next_byte

      # ASCII
      # * Input = 0x00-0x7F : change state to START
      when (0x00..0x7f)
        puts "state: start 1" if DEBUG
        state = "start"

      # Start byte of two byte characters
      # * Input = 0xC2-0xDF: change state to A
      when (0xc2..0xdf)
        puts "state: start 2" if DEBUG
        state = "a"

      # Start byte of some three byte characters
      # * Input = 0xE1-0xEC, 0xEE-0xEF: change state to B
      when (0xe1..0xec)
        puts "state: start 3" if DEBUG
        state = "b"
      when (0xee..0xef)
        puts "state: start 4" if DEBUG
        state = "b"

      # Start byte of special three byte characters
      # * Input = 0xE0: change state to C
      when 0xe0
        puts "state: start 5" if DEBUG
        state = "c"

      # Start byte of the remaining three byte characters
      # * Input = 0xED: change state to D
      when 0xed
        puts "state: start 6" if DEBUG
        state = "d"

      # Start byte of some four byte characters
      # * Input = 0xF1-0xF3: change state to E
      when (0xf1..0xf3)
        puts "state: start 7" if DEBUG
        state = "e"

      # Start byte of special four byte characters
      # * Input = 0xF0: change state to F
      when 0xf0
        puts "state: start 8" if DEBUG
        state = "f"

      # Start byte of very special four byte characters
      # * Input = 0xF4: change state to G
      when 0xf4
        puts "state: start 9" if DEBUG
        state = "g"

      # All other single characters are invalid
      # * Input = Others (0x80-0xBF, 0xC0-0xC1, 0xF5-0xFF): ERROR
      else
        valid = false
        break
      end # of the inner case, the 'start' state

    # The last continuation byte of a 2, 3, or 4 byte character
    # State: 'a'
    #  o Input = 0x80-0xBF: change state to START
    #  o Others: ERROR
    when "a"
      puts "state: a" if DEBUG
      if (0x80..0xbf).include?(next_byte)
        state = "start"
      else
        valid = false
        break
      end

    # The first continuation byte for most 3 byte characters
    # (those with start bytes in: 0xe1-0xec or 0xee-0xef)
    # State: 'b'
    #  o Input = 0x80-0xBF: change state to A
    #  o Others: ERROR
    when "b"
      puts "state: b" if DEBUG
      if (0x80..0xbf).include?(next_byte)
        state = "a"
      else
        valid = false
        break
      end

    # The first continuation byte for some special 3 byte characters
    # (those with start byte 0xe0)
    # State: 'c'
    #  o Input = 0xA0-0xBF: change state to A
    #  o Others: ERROR
    when "c"
      puts "state: c" if DEBUG
      if (0xa0..0xbf).include?(next_byte)
        state = "a"
      else
        valid = false
        break
      end

    # The first continuation byte for the remaining 3 byte characters
    # (those with start byte 0xed)
    # State: 'd'
    #  o Input = 0x80-0x9F: change state to A
    #  o Others: ERROR
    when "d"
      puts "state: d" if DEBUG
      if (0x80..0x9f).include?(next_byte)
        state = "a"
      else
        valid = false
        break
      end

    # The first continuation byte for some 4 byte characters
    # (those with start bytes in: 0xf1-0xf3)
    # State: 'e'
    #  o Input = 0x80-0xBF: change state to B
    #  o Others: ERROR
    when "e"
      puts "state: e" if DEBUG
      if (0x80..0xbf).include?(next_byte)
        state = "b"
      else
        valid = false
        break
      end

    # The first continuation byte for some special 4 byte characters
    # (those with start byte 0xf0)
    # State: 'f'
    #  o Input = 0x90-0xBF: change state to B
    #  o Others: ERROR
    when "f"
      puts "state: f" if DEBUG
      if (0x90..0xbf).include?(next_byte)
        state = "b"
      else
        valid = false
        break
      end

    # The first continuation byte for the remaining 4 byte characters
    # (those with start byte 0xf4)
    # State: 'g'
    #  o Input = 0x80-0x8F: change state to B
    #  o Others: ERROR
    when "g"
      puts "state: g" if DEBUG
      if (0x80..0x8f).include?(next_byte)
        state = "b"
      else
        valid = false
        break
      end

    # Defensive: no transition above assigns any other state.
    else
      raise RuntimeError, "state: default"
    end
  end
  #
  puts "State at end: #{state}" if DEBUG
  # Catch truncation at end of string: every byte seen was acceptable, but the
  # final character is still waiting for continuation bytes.
  if valid && state != "start"
    puts "Resetting valid value" if DEBUG
    valid = false
  end
  #
  if !valid && raise_on_error
    puts "Raising Error" if DEBUG
    # A failure implies at least one byte was examined, so next_byte_save and
    # index are both set here.  The hex forms are built only when needed.
    nb_hex = sprintf "%x", next_byte_save
    ni_hex = sprintf "%x", index
    raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
  end
  #
  valid
end
Disabled; run with --debug to generate this.
Generated with the Darkfish Rdoc Generator 1.1.6.