/*
 *  call-seq:
 *     Token.new(text, start, end, pos_inc = 1) -> new Token
 *
 *  Creates a new token setting the text, start and end offsets of the token
 *  and the position increment for the token.
 *
 *  The position increment is usually set to 1 but you can set it to other
 *  values as needed. For example, if you have a stop word filter you will be
 *  skipping tokens. Let's say you have the stop words "the" and "and" and you
 *  parse the title "The Old Man and the Sea". The terms "Old", "Man" and
 *  "Sea" will have the position increments 2, 1 and 3 respectively.
 *
 *  Another reason you might want to vary the position increment is if you are
 *  adding synonyms to the index. For example let's say you have the synonym
 *  group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
 *  speedy delivery", you'll add "speedy" first with a position increment of 1
 *  and then "fast" and "quick" with position increments of 0 since they are
 *  represented in the same position.
 *
 *  The offset set values +start+ and +end+ should be byte offsets, not
 *  character offsets. This makes it easy to use those offsets to quickly
 *  access the token in the input string and also to insert highlighting tags
 *  when necessary.
 *
 *  An optional fifth argument is also accepted for a token type, but it is
 *  ignored at this stage.
 *
 *  text::       the main text for the token. Converted to a String with
 *               rb_obj_as_string.
 *  start::      the start offset of the token in bytes.
 *  end::        the end offset of the token in bytes.
 *  pos_inc::    the position increment of a token. See above.
 *  return::     a newly created and assigned Token object
 *
 *  +start+, +end+ and +pos_inc+ are converted with NUM2INT, so a value that
 *  cannot be converted to a C int raises instead of being stored as garbage.
 */
static VALUE
frt_token_init(int argc, VALUE *argv, VALUE self)
{
    RToken *token;
    VALUE rtext, rstart, rend, rpos_inc, rtype;
    VALUE text;
    int start, end;
    int pos_inc = 1; /* default when the caller omits pos_inc */
    int given;

    /* GET_TK is defined elsewhere in the project; presumably it unwraps the
     * RToken held by self -- confirm against its definition. */
    GET_TK(token, self);

    /* "32": three required arguments followed by up to two optional ones.
     * rtype only exists to receive the fifth argument, which is not used. */
    given = rb_scan_args(argc, argv, "32",
                         &rtext, &rstart, &rend, &rpos_inc, &rtype);
    if (given >= 4) {
        pos_inc = NUM2INT(rpos_inc);
    }

    /*
     * Perform every conversion that can raise before writing to the token,
     * so a failed call does not leave it half-initialised.
     *
     * NUM2INT is used rather than FIX2INT because these values come straight
     * from Ruby callers: FIX2INT may skip the Fixnum check on builds where
     * int and long have the same width.
     */
    text  = rb_obj_as_string(rtext);
    start = NUM2INT(rstart);
    end   = NUM2INT(rend);

    token->text    = text;
    token->start   = start;
    token->end     = end;
    token->pos_inc = pos_inc;

    return self;
}