Parent

Files

Class Index [+]

Quicksearch

UTF8::Validator

Purpose

Validate UTF-8, primarily in Ruby environments other than 1.9.

Instances of this class are thread safe, and a single instance may be used safely by multiple concurrent threads, with one caveat:

The value of the DEBUG constant must not be changed by any thread.

Constants

DEBUG

For use during development only.

Public Instance Methods

valid_encoding?(string, raise_on_error = false) click to toggle source

Validate the supplied string for proper UTF-8 encoding.

Calling Sequence:

   validator = UTF8::Validator.new                           -> validator
   validator.valid_encoding?(string)                         -> true or false
   validator.valid_encoding?(string, true)                   -> true or exception

Parameters:

string

the string to validate

raise_on_error

a flag to indicate failure behavior

When raise_on_error is true and a string fails validation, an error of type UTF8::ValidationError is raised. The byte in error and the location of that byte are described in the error message.

     # File lib/validation/validator.rb, line 91
 91:   def valid_encoding?(string, raise_on_error = false)
 92:     bytes = string.bytes
 93:     #
 94:     valid = true
 95:     index = 1
 96:     nb_hex = nil
 97:     ni_hex = nil
 98:     state = "start"
 99:     next_byte_save = nil
100:     #
101:     bytes.each do |next_byte|
102:       index += 1
103:       next_byte_save = next_byte
104:       ni_hex = sprintf "%x", index
105:       nb_hex = sprintf "%x", next_byte
106:       puts "Top: #{next_byte}(0x#{nb_hex}), index: #{index}(0x#{ni_hex})" if DEBUG
107:       case state
108: 
109:         # State: 'start'
110:         # The 'start' state:
111:         # * handles all occurrences of valid single byte characters i.e., the ASCII character set
112:         # * provides state transition logic for start bytes of valid characters with 2-4 bytes
113:         # * signals a validation failure for all other single bytes
114:         # 
115:         when "start"
116:           puts "state: start" if DEBUG
117:           case next_byte
118: 
119:             # ASCII
120:             # * Input = 0x00-0x7F : change state to START
121:             when (0x00..0x7f)
122:               puts "state: start 1" if DEBUG
123:               state = "start"
124: 
125:             # Start byte of two byte characters
126:             # * Input = 0xC2-0xDF: change state to A
127:             when (0xc2..0xdf)
128:               puts "state: start 2" if DEBUG
129:               state = "a"
130: 
131:             # Start byte of some three byte characters
132:             # * Input = 0xE1-0xEC, 0xEE-0xEF: change state to B
133:             when (0xe1..0xec)
134:               puts "state: start 3" if DEBUG
135:               state = "b"
136:             when (0xee..0xef)
137:               puts "state: start 4" if DEBUG
138:               state = "b"
139: 
140:             # Start byte of special three byte characters
141:             # * Input = 0xE0: change state to C
142:             when 0xe0
143:               puts "state: start 5" if DEBUG
144:               state = "c"
145: 
146:             # Start byte of the remaining three byte characters
147:             # * Input = 0xED: change state to D
148:             when 0xed
149:               puts "state: start 6" if DEBUG
150:               state = "d"
151: 
152:             # Start byte of some four byte characters
153:             # * Input = 0xF1-0xF3:change state to E
154:             when (0xf1..0xf3)
155:               puts "state: start 7" if DEBUG
156:               state = "e"
157: 
158:             # Start byte of special four byte characters
159:             # * Input = 0xF0: change state to F
160:             when 0xf0
161:               puts "state: start 8" if DEBUG
162:               state = "f"
163: 
164:             # Start byte of very special four byte characters
165:             # * Input = 0xF4: change state to G
166:             when 0xf4
167:               puts "state: start 9" if DEBUG
168:               state = "g"
169: 
170:             # All other single characters are invalid
171:             # * Input = Others (0x80-0xBF,0xC0-0xC1, 0xF5-0xFF): ERROR
172:             else
173:               valid = false
174:               break
175:           end # of the inner case, the 'start' state
176: 
177:         # The last continuation byte of a 2, 3, or 4 byte character
178:         # State: 'a'
179:         #  o Input = 0x80-0xBF: change state to START
180:         #  o Others: ERROR
181:         when "a"
182:           puts "state: a" if DEBUG
183:           if (0x80..0xbf) === next_byte
184:             state = "start"
185:           else
186:             valid = false
187:             break
188:           end
189: 
190:         # The first continuation byte for most 3 byte characters
191:         # (those with start bytes in: 0xe1-0xec or 0xee-0xef)
192:         # State: 'b'
193:         # o Input = 0x80-0xBF: change state to A
194:         # o Others: ERROR
195:         when "b"
196:           puts "state: b" if DEBUG
197:           if (0x80..0xbf) === next_byte
198:             state = "a"
199:           else
200:             valid = false
201:             break
202:           end
203: 
204:         # The first continuation byte for some special 3 byte characters
205:         # (those with start byte 0xe0)
206:         # State: 'c'
207:         # o Input = 0xA0-0xBF: change state to A
208:         # o Others: ERROR
209:         when "c"
210:           puts "state: c" if DEBUG
211:           if (0xa0..0xbf) === next_byte
212:             state = "a"
213:           else
214:             valid = false
215:             break
216:           end
217: 
218:         # The first continuation byte for the remaining 3 byte characters
219:         # (those with start byte 0xed)
220:         # State: 'd'
221:         # o Input = 0x80-0x9F: change state to A
222:         # o Others: ERROR
223:         when "d"
224:           puts "state: d" if DEBUG
225:           if (0x80..0x9f) === next_byte
226:             state = "a"
227:           else
228:             valid = false
229:             break
230:           end
231: 
232:         # The first continuation byte for some 4 byte characters
233:         # (those with start bytes in: 0xf1-0xf3)
234:         # State: 'e'
235:         # o Input = 0x80-0xBF: change state to B
236:         # o Others: ERROR
237:         when "e"
238:           puts "state: e" if DEBUG
239:           if (0x80..0xbf) === next_byte
240:             state = "b"
241:           else
242:             valid = false
243:             break
244:           end
245: 
246:         # The first continuation byte for some special 4 byte characters
247:         # (those with start byte 0xf0)
248:         # State: 'f'
249:         # o Input = 0x90-0xBF: change state to B
250:         # o Others: ERROR
251:         when "f"
252:           puts "state: f" if DEBUG
253:           if (0x90..0xbf) === next_byte
254:             state = "b"
255:           else
256:             valid = false
257:             break
258:           end
259: 
260:         # The first continuation byte for the remaining 4 byte characters
261:         # (those with start byte 0xf4)
262:         # State: 'g'
263:         # o Input = 0x80-0x8F: change state to B
264:         # o Others: ERROR
265:         when "g"
266:           puts "state: g" if DEBUG
267:           if (0x80..0x8f) === next_byte
268:             state = "b"
269:           else
270:             valid = false
271:             break
272:           end
273: 
274:         #
275:         else
276:           raise RuntimeError, "state: default"
277:       end
278:     end
279:     #
280:     puts "State at end: #{state}" if DEBUG
281:     # Catch truncation at end of string
282:     if valid and state != 'start'
283:       puts "Resetting valid value" if DEBUG
284:       valid = false
285:     end
286:     #
287:     if !valid and raise_on_error
288:       puts "Raising Error" if DEBUG
289:       raise ValidationError, "Invalid byte:#{next_byte_save}(0x#{nb_hex}),index:#{index}(0x#{ni_hex})"
290:     end
291:     #
292:     valid
293:   end

Disabled; run with --debug to generate this.

[Validate]

Generated with the Darkfish Rdoc Generator 1.1.6.