Object
Validates UTF-8 encoded strings, primarily for use in Ruby environments other than 1.9.
Instances of this class are thread safe, and a single instance may be used safely by multiple concurrent threads, with one caveat:
The value of #{DEBUG} must not be changed by any thread.
For use during development only.
Validate the supplied string for proper UTF-8 encoding.
Calling Sequence:
validator = UTF8::Validator.new -> validator
validator.valid_encoding?(string) -> true or false
validator.valid_encoding?(string, true) -> true or exception
Parameters:
| string | the string to validate |
| raise_on_error | a flag to indicate failure behavior |
When raise_on_error is true and a string fails validation, an error of type #{UTF8::ValidationError} is raised. The byte in error and the location of that byte are described in the error message.
# File lib/validation/validator.rb, line 91
# Validate the supplied string for proper UTF-8 encoding.
#
# The bytes of +string+ are fed one at a time through a state machine.
# The 'start' state accepts ASCII bytes and the lead bytes of 2-4 byte
# characters; the states 'a' through 'g' each accept one continuation byte
# from a restricted range and then move on to the next state.
#
# Parameters:
#
# string::         the string to validate
# raise_on_error:: when true, raise instead of returning false
#
# Returns true when every byte is accepted and the string does not end in
# the middle of a multi-byte character, otherwise false.
#
# Raises ValidationError when +raise_on_error+ is true and validation fails.
# The message names the last byte examined (decimal and hex) and its
# 0-based offset within the string (decimal and hex).
def valid_encoding?(string, raise_on_error = false)
  # Continuation states: state name => [acceptable byte range, next state].
  #   a - last continuation byte of any 2, 3 or 4 byte character
  #   b - first continuation byte after lead bytes 0xE1-0xEC, 0xEE-0xEF,
  #       and second continuation byte of every 4 byte character
  #   c - first continuation byte after lead byte 0xE0
  #   d - first continuation byte after lead byte 0xED
  #   e - first continuation byte after lead bytes 0xF1-0xF3
  #   f - first continuation byte after lead byte 0xF0
  #   g - first continuation byte after lead byte 0xF4
  continuation = {
    "a" => [(0x80..0xbf), "start"],
    "b" => [(0x80..0xbf), "a"],
    "c" => [(0xa0..0xbf), "a"],
    "d" => [(0x80..0x9f), "a"],
    "e" => [(0x80..0xbf), "b"],
    "f" => [(0x90..0xbf), "b"],
    "g" => [(0x80..0x8f), "b"]
  }

  valid = true
  # Starts at -1 because it is incremented before use: the first byte is
  # therefore reported at offset 0.
  index = -1
  state = "start"
  next_byte_save = nil

  string.bytes.each do |next_byte|
    index += 1
    next_byte_save = next_byte
    if DEBUG
      puts "Top: #{next_byte}(0x#{sprintf('%x', next_byte)}), index: #{index}(0x#{sprintf('%x', index)})"
      puts "state: #{state}"
    end

    if state == "start"
      case next_byte
      when (0x00..0x7f)
        # ASCII: a complete single byte character, remain in 'start'
        puts "state: start 1" if DEBUG
        state = "start"
      when (0xc2..0xdf)
        # Lead byte of a two byte character
        puts "state: start 2" if DEBUG
        state = "a"
      when (0xe1..0xec)
        # Lead byte of most three byte characters
        puts "state: start 3" if DEBUG
        state = "b"
      when (0xee..0xef)
        puts "state: start 4" if DEBUG
        state = "b"
      when 0xe0
        # Lead byte whose first continuation byte is limited to 0xA0-0xBF
        puts "state: start 5" if DEBUG
        state = "c"
      when 0xed
        # Lead byte whose first continuation byte is limited to 0x80-0x9F
        puts "state: start 6" if DEBUG
        state = "d"
      when (0xf1..0xf3)
        # Lead byte of most four byte characters
        puts "state: start 7" if DEBUG
        state = "e"
      when 0xf0
        # Lead byte whose first continuation byte is limited to 0x90-0xBF
        puts "state: start 8" if DEBUG
        state = "f"
      when 0xf4
        # Lead byte whose first continuation byte is limited to 0x80-0x8F
        puts "state: start 9" if DEBUG
        state = "g"
      else
        # 0x80-0xBF, 0xC0-0xC1 and 0xF5-0xFF can never start a character
        valid = false
        break
      end
    else
      allowed, following = continuation[state]
      # Guards against an unknown state name; not expected to happen.
      raise RuntimeError, "state: default" if allowed.nil?
      if allowed === next_byte
        state = following
      else
        valid = false
        break
      end
    end
  end

  puts "State at end: #{state}" if DEBUG
  # Catch truncation at end of string: input ended inside a character.
  if valid && state != "start"
    puts "Resetting valid value" if DEBUG
    valid = false
  end

  if !valid && raise_on_error
    puts "Raising Error" if DEBUG
    # Hex forms are built only here (and for debug output) rather than for
    # every byte processed.
    nb_hex = sprintf("%x", next_byte_save)
    ni_hex = sprintf("%x", index)
    raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
  end

  valid
end
Disabled; run with --debug to generate this.
Generated with the Darkfish Rdoc Generator 1.1.6.