# Word-frequency list for japaneseIn.txt, written to wordListOut.txt.
# Pipeline stages, in order:
#   1. drop lines that start with EOS;
#   2. drop lines that contain 記号 (symbol) or a word character followed by
#      助詞 (particle);
#   3. cut each line at its first tab, keeping only the text before it;
#   4. count identical lines and list them by descending count;
#   5. drop counted entries that consist solely of digits, ASCII letters and
#      the punctuation listed in the bracket expression.
# The \t, \s and \w escapes are GNU extensions to sed/grep.
mecab japaneseIn.txt \
  | grep -E -v "^EOS" \
  | grep -E -v "記号|\w助詞" \
  | sed "s/\t.*//" \
  | sort \
  | uniq -c \
  | sort -n -r \
  | grep -v "^\s*[0-9]*\s*[0-9a-zA-Z,./\\<>?_;:@{}^~。*%()\-]*$" \
  > wordListOut.txt

# MeCab analysis of Asic-wiki.txt with the noise removed: lines tagged as
# symbols (記号), numeric nouns (名詞,数), sa-hen nouns (名詞,サ変接続) and the
# EOS lines are filtered out; everything else is printed unchanged.
mecab Asic-wiki.txt \
  | grep -v "記号,.*,\*,\*" \
  | grep -v "名詞,数,\*,\*,\*,\*,\*" \
  | grep -v "名詞,サ変接続,\*,\*,\*,\*,\*" \
  | grep -v "^EOS"
verbs;
# Print every line of MeCab's analysis of Asic-wiki.txt that contains 助詞.
# NOTE(review): the label preceding this command says "verbs", but 助詞 is the
# particle tag (verbs are tagged 動詞) — confirm which one was intended.
mecab Asic-wiki.txt \
  | grep "助詞"
verb (dictionary form) occurrence count;
# Occurrence count of each verb in its dictionary form, most frequent first.
# Keeps the MeCab lines that contain 動詞 and also 一段 or 五段, then reduces
# each line to the field two commas after the 一段/五段 field before counting.
# NOTE(review): 動詞 is also a substring of 助動詞, so auxiliary-verb lines
# pass the first filter too — confirm that is intended.
mecab Asic-wiki.txt \
  | grep "動詞" \
  | grep -E "一段|五段" \
  | sed "s/.*[一五]段[^,]*,[^,]*,\([^,]*\),.*/\1/" \
  | sort \
  | uniq -c \
  | sort -n -r
verb transform (conjugated form) occurrence count;
# Occurrence count of each verb "transform", most frequent first.
# Keeps the MeCab lines that contain 動詞 and also 一段 or 五段, then reduces
# each line to the field immediately after the 一段/五段 field (presumably the
# conjugation form — verify against the dictionary's output format).
# NOTE(review): 動詞 is also a substring of 助動詞, so auxiliary-verb lines
# pass the first filter too — confirm that is intended.
mecab Asic-wiki.txt \
  | grep "動詞" \
  | grep -E "一段|五段" \
  | sed "s/.*[一五]段[^,]*,\([^,]*\),.*/\1/" \
  | sort \
  | uniq -c \
  | sort -n -r
Of course, these are hacky one-liners; I have a Ruby function that parses the file and gives me the various results.
The critical column is given by this:
# Reduce each line of MeCab's analysis of Asic-wiki.txt to its seventh
# comma-separated field. Commas are the only delimiter, so the first "field"
# is everything up to the first comma. Lines with fewer than six commas do not
# match the expression and are printed unchanged.
mecab Asic-wiki.txt \
  | sed "s/[^,]*,[^,]*,[^,]*,[^,]*,[^,]*,[^,]*,\([^,]*\).*/\1/"
No comments:
Post a Comment