class Lingo::Attendee::VectorFilter

Constants

DEFAULT_DICT_SEPARATOR
DEFAULT_GENDER_SEPARATOR
DEFAULT_POS_SEPARATOR
DEFAULT_SRC_SEPARATOR

Public Instance Methods

control(cmd, *) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 131
# Reacts to a control command.
#
# Returns :skip_command for :EOL. For a command matched (via ===) by one of
# TERMINALS, calls send_vectors -- but only while @docnum is unset. Every
# other case yields nil. Extra arguments are accepted and ignored.
def control(cmd, *)
  return :skip_command if :EOL == cmd
  return unless TERMINALS.any? { |terminal| terminal === cmd }

  # With @docnum set, nothing is sent from here.
  send_vectors unless @docnum
end
control_deferred(cmd, *) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 138
# Advances @docnum by one whenever +cmd+ is included in TERMINALS.
#
# Returns the new @docnum in that case, nil otherwise. Extra arguments are
# accepted and ignored.
def control_deferred(cmd, *)
  return unless TERMINALS.include?(cmd)

  @docnum += 1
end
init() click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 92
# Sets up the filter's state from its options (read through get_re, get_ary
# and get_key).
#
# * 'lexicals' and 'skip' select which lexicals / word attributes are used.
# * 'dict' switches to dictionary output ('norm' is only read in that mode);
#   otherwise 'src', 'pos' and 'tokens' are read. A value of +true+ for
#   'dict', 'src' or 'pos' is replaced by the matching DEFAULT_*_SEPARATOR.
# * 'sort' (default 'normal', or false when ENV['LINGO_NO_SORT'] is set) is
#   split at the first underscore into a format name and a method; the
#   method 'rel' sets @sort_rel.
# * 'tfidf' is only read for a non-'normal' sort format; when set, the
#   attendee is enhanced via DeferredAttendee and @docnum starts at 0.
def init
  @lex  = get_re('lexicals', '[sy]')
  @skip = get_ary('skip', DEFAULT_SKIP, :upcase)

  @src = @pos = @sort_fmt = @sort_rel = @docnum = nil

  @tokens     = []
  @vectors    = Hash.array(1)
  @word_count = Hash.new(0)

  @dict = get_key('dict', false)

  if @dict
    @norm = get_key('norm', false)
    @dict = DEFAULT_DICT_SEPARATOR if @dict == true
  else
    @src = get_key('src', false)
    @src = DEFAULT_SRC_SEPARATOR if @src == true

    @pos = get_key('pos', false)
    @pos = DEFAULT_POS_SEPARATOR if @pos == true

    @tokens = get_ary('tokens', '', :upcase)
    # 'ALL' is a placeholder for every tokenizer rule.
    @tokens.concat(Tokenizer.rules) if @tokens.delete('ALL')
  end

  sort = get_key('sort', ENV['LINGO_NO_SORT'] ? false : 'normal')
  return unless sort

  @sort_fmt, sort_method = sort.downcase.split('_', 2)

  relative  = sort_method == 'rel'
  @sort_rel = relative

  return if @sort_fmt == 'normal'

  @tfidf = get_key('tfidf', false)

  if @tfidf
    DeferredAttendee.enhance(self)
    @docnum = 0
    # Forces the float format below; @sort_rel itself is left untouched.
    relative = true
  end

  template  = @sort_fmt == 'sto' ? '%2$s {%1$X}' : '%X %s'
  @sort_fmt = template.sub('X', relative ? '.5f' : 'd')
end
process(obj) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 142
# Filters +obj+ and forwards the vectors derived from it.
#
# A Token is accepted only when its attr is listed in @tokens, a Word only
# when its attr is not listed in @skip; any other object is ignored. Each
# accepted object increments the word count stored under the current
# @docnum. In dictionary mode (@dict) the object goes to forward_dict;
# otherwise a Token is forwarded as-is and a Word contributes one vector
# per lexical matched by @lex, together with that lexical's src.
def process(obj)
  token = obj.is_a?(Token)

  accepted =
    if token
      @tokens.include?(obj.attr)
    elsif obj.is_a?(Word)
      !@skip.include?(obj.attr)
    else
      false
    end

  return unless accepted

  @word_count[@docnum] += 1

  return forward_dict(obj) if @dict

  # The position is only looked up when a position separator is configured.
  pos = @pos ? obj.position_and_offset : nil

  if token
    forward_vector(obj, pos)
  else
    obj.each_lex(@lex) { |lex| forward_vector(lex, pos, lex.src) }
  end
end

Private Instance Methods

flush_deferred()
Alias for: send_vectors
flush_vectors(*args, &block) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 229
# Collects the entries produced by map_vectors (all arguments and the block
# are passed straight through), orders them by descending first element
# and then ascending second element, and forwards each entry formatted
# with @sort_fmt.
#
# Returns the ordered entries.
def flush_vectors(*args, &block)
  entries = map_vectors(*args, &block)
  ordered = entries.sort_by { |weight, vector| [-weight, vector] }

  ordered.each { |entry| forward(@sort_fmt % entry) }
end
forward_dict(obj, sep = DEFAULT_GENDER_SEPARATOR) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 171
# Forwards a dictionary-style line for +obj+.
#
# Each lexical of +obj+ matching @lex is rendered as "form #attr", with
# +sep+ and the gender appended when the lexical has one. The renderings
# are joined by spaces and prefixed with the object's form (its lex_form
# when @norm is set) and the @dict separator. Nothing is forwarded, and nil
# is returned, when there are no matching lexicals.
def forward_dict(obj, sep = DEFAULT_GENDER_SEPARATOR)
  entries = obj.each_lex(@lex).map do |lex|
    entry = "#{lex.form} ##{lex.attr}"
    entry << sep << lex.gender if lex.gender
    entry
  end

  return if entries.empty?

  head = @norm ? obj.lex_form : obj.form
  forward_vector("#{head}#{@dict}#{entries.join(' ')}")
end
forward_vector(vec, pos = nil, src = nil) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 184
# Records or forwards a single vector.
#
# +vec+ (or its form, when it is a WordForm) is downcased with
# Unicode.downcase. When a source separator (@src) is configured and +src+
# is given, the separator and the source (its form, when it is a Token) are
# appended. With a sort format set, +pos+ is appended to the list kept for
# that vector; otherwise the vector is forwarded immediately.
def forward_vector(vec, pos = nil, src = nil)
  text = Unicode.downcase(vec.is_a?(WordForm) ? vec.form : vec)

  if @src && src
    origin = src.is_a?(Token) ? src.form : src
    text << @src << origin
  end

  if @sort_fmt
    vectors[text] << pos
  else
    forward(vec_pos(text, [pos]))
  end
end
map_vectors(wc = 1, docnum = nil) { |size| ... } click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 222
# Maps the vectors stored for +docnum+ (see #vectors) through the given
# block and returns the block's results.
#
# The block receives the number of recorded positions divided by +wc+ (as
# a float), the vector itself, and the vector rendered by vec_pos. The
# stored vectors are cleared afterwards, even if the block raises.
def map_vectors(wc = 1, docnum = nil)
  bucket = vectors(docnum)

  bucket.map do |term, positions|
    # Take the size first: vec_pos modifies the positions list in place.
    frequency = positions.size / wc.to_f
    yield frequency, term, vec_pos(term, positions)
  end
ensure
  bucket.clear if bucket
end
send_vectors() { |lambda| ... } click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 196
# Emits the collected vectors, then clears the word counts and vectors.
#
# * With @docnum set: counts, per vector, how many entries of @vectors
#   contain it; writes those counts via open_csv when @tfidf is a String;
#   then yields a lambda. Called with a docnum, the lambda flushes that
#   document's vectors with each weight divided by the vector's count.
# * With the 'normal' sort format: flushes the rendered vectors, sorted.
# * Otherwise: flushes via flush_vectors, using the word count as divisor
#   when @sort_rel is set and 1 when it is not.
def send_vectors
  if @docnum
    doc_freq = Hash.new(0)
    fixed_wc = @sort_rel ? nil : 1

    @vectors.each_value do |terms|
      terms.each_key { |term| doc_freq[term] += 1 }
    end

    if @tfidf.is_a?(String)
      open_csv(@tfidf, 'wb') do |csv|
        doc_freq.sort.each { |row| csv << row }
      end
    end

    yield lambda { |docnum|
      total = fixed_wc || word_count(docnum)

      flush_vectors(total, docnum) do |freq, term, rendered|
        [freq / doc_freq[term], rendered]
      end
    }
  elsif @sort_fmt == 'normal'
    flush(map_vectors { |_, _, rendered| rendered }.sort!)
  else
    flush_vectors(@sort_rel ? word_count : 1) { |freq, _, rendered| [freq, rendered] }
  end

  @word_count.clear
  @vectors.clear
end
Also aliased as: flush_deferred
vec_pos(vec, pos) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 235
# Renders +vec+ together with its positions.
#
# +pos+ is modified in place: it is emptied when no position separator
# (@pos) is configured, otherwise nils and duplicates are removed. Returns
# +vec+ itself when no positions remain, else a new string made of the
# vector, the separator and the comma-joined positions.
def vec_pos(vec, pos)
  if @pos
    pos.compact!
    pos.uniq!
  else
    pos.clear
  end

  return vec if pos.empty?

  "#{vec}#{@pos}#{pos.join(',')}"
end
vectors(docnum = nil) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 163
# Returns the entry of @vectors stored under +docnum+, falling back to the
# current @docnum when +docnum+ is nil or false.
def vectors(docnum = nil)
  docnum ||= @docnum
  @vectors[docnum]
end
word_count(docnum = nil) click to toggle source
# File lib/lingo/attendee/vector_filter.rb, line 167
# Returns the entry of @word_count stored under +docnum+, falling back to
# the current @docnum when +docnum+ is nil or false.
def word_count(docnum = nil)
  docnum ||= @docnum
  @word_count[docnum]
end