class Lingo::Attendee::VectorFilter
Constants
- DEFAULT_DICT_SEPARATOR
- DEFAULT_GENDER_SEPARATOR
- DEFAULT_POS_SEPARATOR
- DEFAULT_SRC_SEPARATOR
Public Instance Methods
control(cmd, *)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 131
#
# Reacts to a control command.
#
# * +:EOL+ returns +:skip_command+.
# * A command matching one of +TERMINALS+ calls +send_vectors+, but only
#   while <tt>@docnum</tt> is unset; with <tt>@docnum</tt> set, +nil+ is
#   returned and nothing is sent from here.
# * Any other command returns +nil+.
#
# Additional arguments are accepted and ignored.
def control(cmd, *)
  return :skip_command if :EOL == cmd

  # +case+ keeps the original <tt>===</tt> matching against each terminal.
  case cmd
  when *TERMINALS
    send_vectors unless @docnum
  end
end
control_deferred(cmd, *)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 138
#
# Advances the document counter <tt>@docnum</tt> by one whenever +cmd+ is
# contained in +TERMINALS+. Returns the new counter value in that case and
# +nil+ otherwise. Additional arguments are accepted and ignored.
def control_deferred(cmd, *)
  return unless TERMINALS.include?(cmd)

  @docnum += 1
end
init()
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 92
#
# Sets up the filter state from its configuration.
#
# NOTE(review): +get_re+, +get_ary+ and +get_key+ are defined outside this
# view; presumably they look up the named option and fall back to the given
# default -- confirm against their definitions.
#
# Options consulted, in order: +lexicals+, +skip+, +dict+, then either
# +norm+ (dict mode) or +src+, +pos+ and +tokens+ (vector mode), then
# +sort+ and, for any sort format other than +normal+, +tfidf+.
def init
  @lex  = get_re('lexicals', '[sy]')
  @skip = get_ary('skip', DEFAULT_SKIP, :upcase)

  @src = @pos = @sort_fmt = @sort_rel = @docnum = nil

  @tokens     = []
  @vectors    = Hash.array(1)
  @word_count = Hash.new(0)

  @dict = get_key('dict', false)

  if @dict
    @norm = get_key('norm', false)

    # A bare +true+ selects the default separator; any other truthy value
    # is used as the separator itself.
    @dict = DEFAULT_DICT_SEPARATOR if @dict == true
  else
    @src = get_key('src', false)
    @src = DEFAULT_SRC_SEPARATOR if @src == true

    @pos = get_key('pos', false)
    @pos = DEFAULT_POS_SEPARATOR if @pos == true

    @tokens = get_ary('tokens', '', :upcase)

    # 'ALL' is a placeholder that is swapped for every tokenizer rule.
    @tokens.concat(Tokenizer.rules) if @tokens.delete('ALL')
  end

  # Sorting is on ('normal') by default unless LINGO_NO_SORT is set in the
  # environment.
  sort = get_key('sort', ENV['LINGO_NO_SORT'] ? false : 'normal')
  return unless sort

  # "<format>_<method>", e.g. a format followed by '_rel'.
  @sort_fmt, sort_method = sort.downcase.split('_', 2)
  @sort_rel = rel = sort_method == 'rel'

  # 'normal' keeps the plain format name; no printf template is built.
  return if @sort_fmt == 'normal'

  if (@tfidf = get_key('tfidf', false))
    DeferredAttendee.enhance(self)

    # Start counting documents and force the floating-point template.
    @docnum = 0
    rel = true
  end

  # 'X' is a placeholder for the numeric conversion: '.5f' for relative
  # weights, 'd' for absolute counts.
  template  = @sort_fmt == 'sto' ? '%2$s {%1$X}' : '%X %s'
  @sort_fmt = template.sub('X', rel ? '.5f' : 'd')
end
process(obj)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 142
#
# Handles one stream object.
#
# A +Token+ is accepted only if its +attr+ is listed in <tt>@tokens</tt>; a
# +Word+ is accepted unless its +attr+ is listed in <tt>@skip</tt>; anything
# else is ignored. Each accepted object bumps the word count of the current
# document and is then forwarded either as a dictionary entry (when
# <tt>@dict</tt> is set) or as one or more vectors.
def process(obj)
  token = obj.is_a?(Token)

  if token
    return unless @tokens.include?(obj.attr)
  elsif obj.is_a?(Word)
    return if @skip.include?(obj.attr)
  else
    return
  end

  @word_count[@docnum] += 1

  return forward_dict(obj) if @dict

  # Positions are only looked up when a position separator is configured.
  pos = obj.position_and_offset if @pos

  if token
    forward_vector(obj, pos)
  else
    # One vector per lexical matching @lex, each tagged with its source.
    obj.each_lex(@lex) { |lex| forward_vector(lex, pos, lex.src) }
  end
end
Private Instance Methods
flush_vectors(*args, &block)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 229
#
# Collects the entries produced by +map_vectors+ (called with the given
# arguments and block), orders them by descending first element and then by
# ascending second element, and forwards each one formatted with
# <tt>@sort_fmt</tt>. Returns the sorted entries.
def flush_vectors(*args, &block)
  weighted = map_vectors(*args, &block)
  ranked   = weighted.sort_by { |weight, vector| [-weight, vector] }

  ranked.each { |entry| forward(@sort_fmt % entry) }
end
forward_dict(obj, sep = DEFAULT_GENDER_SEPARATOR)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 171
#
# Forwards +obj+ as a single dictionary line.
#
# Every lexical of +obj+ matching <tt>@lex</tt> is rendered as
# "<form> #<attr>", with +sep+ and the gender appended when the lexical has
# one. If there is at least one such entry, the line
# "<head><@dict><entries joined by a space>" is passed to +forward_vector+,
# where the head is <tt>obj.lex_form</tt> when <tt>@norm</tt> is set and
# <tt>obj.form</tt> otherwise. Returns +nil+ when there are no entries.
def forward_dict(obj, sep = DEFAULT_GENDER_SEPARATOR)
  entries = obj.each_lex(@lex).map do |lex|
    entry = "#{lex.form} ##{lex.attr}"
    entry << sep << lex.gender if lex.gender
    entry
  end

  return if entries.empty?

  head = @norm ? obj.lex_form : obj.form
  forward_vector("#{head}#{@dict}#{entries.join(' ')}")
end
forward_vector(vec, pos = nil, src = nil)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 184
#
# Normalizes one vector and either stores or forwards it.
#
# +vec+ may be a +WordForm+ (its +form+ is used) or a string; it is
# downcased via <tt>Unicode.downcase</tt>. When a source separator
# (<tt>@src</tt>) is configured and +src+ is given, the separator and the
# source (its +form+ if it is a +Token+) are appended.
#
# With sorting enabled (<tt>@sort_fmt</tt> set) the position is recorded
# under the vector for later flushing; otherwise the vector is forwarded
# immediately, decorated by +vec_pos+.
def forward_vector(vec, pos = nil, src = nil)
  text = vec.is_a?(WordForm) ? vec.form : vec
  text = Unicode.downcase(text)

  if @src && src
    origin = src.is_a?(Token) ? src.form : src
    text << @src << origin
  end

  if @sort_fmt
    vectors[text] << pos
  else
    forward(vec_pos(text, [pos]))
  end
end
map_vectors(wc = 1, docnum = nil) { |size| ... }
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 222
#
# Yields every stored vector of the selected document and returns the
# collected block results.
#
# The block receives three values: the number of recorded positions divided
# by +wc+ (as a float), the vector itself, and the vector as rendered by
# +vec_pos+. The document's vector store is emptied afterwards, even if the
# block raises.
def map_vectors(wc = 1, docnum = nil)
  bucket = vectors(docnum)

  bucket.map do |vec, positions|
    # Take the count before vec_pos gets to see the position list.
    weight = positions.size / wc.to_f
    yield weight, vec, vec_pos(vec, positions)
  end
ensure
  bucket.clear if bucket
end
send_vectors() { |lambda| ... }
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 196
#
# Emits the collected vectors and resets the collected state.
#
# Three modes, depending on configuration:
#
# * Document counting active (<tt>@docnum</tt> set): counts in how many
#   documents each vector occurs, optionally writes those counts as CSV to
#   the path in <tt>@tfidf</tt> (when it is a String), and yields a lambda
#   to the caller. Called with a document number, the lambda flushes that
#   document's vectors with each weight divided by the vector's document
#   count.
# * Sort format 'normal': the rendered vectors are sorted and flushed.
# * Otherwise: vectors are flushed with their weight, relative to the word
#   count when <tt>@sort_rel</tt> is set.
#
# Afterwards the word counts and the vector store are cleared.
def send_vectors
  if @docnum
    doc_freq = Hash.new(0)

    # nil means "use the per-document word count" (relative weights).
    fixed_wc = @sort_rel ? nil : 1

    @vectors.each_value do |doc_vectors|
      doc_vectors.each_key { |vec| doc_freq[vec] += 1 }
    end

    if @tfidf.is_a?(String)
      open_csv(@tfidf, 'wb') { |csv| doc_freq.sort.each { |row| csv << row } }
    end

    yield ->(docnum) {
      wc = fixed_wc || word_count(docnum)
      flush_vectors(wc, docnum) { |weight, vec, rendered| [weight / doc_freq[vec], rendered] }
    }
  elsif @sort_fmt == 'normal'
    rendered = map_vectors { |_, _, vp| vp }
    flush(rendered.sort!)
  else
    divisor = @sort_rel ? word_count : 1
    flush_vectors(divisor) { |weight, _, vp| [weight, vp] }
  end

  @word_count.clear
  @vectors.clear
end
Also aliased as: flush_deferred
vec_pos(vec, pos)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 235
#
# Renders +vec+ together with its positions.
#
# Returns +vec+ unchanged when no position separator (<tt>@pos</tt>) is
# configured or when +pos+ holds no non-nil entries. Otherwise returns
# "<vec><@pos><positions>", where the positions are the distinct non-nil
# entries of +pos+ in their original order, joined by commas.
#
# The +pos+ array passed in is left untouched; de-duplication works on a
# copy so callers keep their position lists intact.
def vec_pos(vec, pos)
  return vec unless @pos

  positions = pos.compact.uniq
  positions.empty? ? vec : "#{vec}#{@pos}#{positions.join(',')}"
end
vectors(docnum = nil)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 163
#
# Returns the vector store for +docnum+, falling back to the current
# document (<tt>@docnum</tt>) when +docnum+ is +nil+ or +false+.
def vectors(docnum = nil)
  key = docnum || @docnum
  @vectors[key]
end
word_count(docnum = nil)
click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 167
#
# Returns the word count recorded for +docnum+, falling back to the current
# document (<tt>@docnum</tt>) when +docnum+ is +nil+ or +false+.
def word_count(docnum = nil)
  key = docnum || @docnum
  @word_count[key]
end