Unsupervised learning

M. Benesty

2019-03-10

 library(fastrtext)
    
    # Load the example sentence datasets (presumably shipped with fastrtext).
    data(list = c("train_sentences", "test_sentences"))
    # The training corpus is the lower-cased "text" column, written one entry
    # per line to a temporary file that is then handed to the trainer.
    texts <- tolower(train_sentences[, "text"])
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()
    writeLines(texts, tmp_file_txt)
    # Train unsupervised word embeddings with the "skipgram" command;
    # tmp_file_model is the output path prefix for the trained model.
    execute(
      commands = c(
        "skipgram",
        "-input", tmp_file_txt,
        "-output", tmp_file_model,
        "-verbose", 1
      )
    )
## 
Read 0M words
## Number of words:  2061
## Number of labels: 0
## 
Progress: 100.0% words/sec/thread:   14085 lr:  0.000000 loss:  2.735799 ETA:   0h 0m
    # Load the model trained above. Only the extension-less output prefix is
    # passed: load_model() reports that it adds the ".bin" extension to the
    # path itself (see the message printed below).
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # Extract the vocabulary learned by the model and show its first five
    # entries.
    dict <- get_dictionary(model)
    print(head(dict, n = 5))
## [1] "the"  "</s>" "of"   "to"   "and"
  # Print the embeddings of two related words. The result is a matrix with
  # one row per requested word (100 columns for this model).
  print(
    get_word_vectors(model, c("time", "timing"))
  )
##              [,1]        [,2]       [,3]       [,4]       [,5]
## time   0.03754644  0.04472022 -0.1191465 0.03307503 0.09531306
## timing 0.07334084 -0.02790558 -0.1841608 0.06639251 0.05460514
##                [,6]      [,7]         [,8]        [,9]      [,10]
## time   -0.003451684 0.0987045 -0.001799128 -0.04122123 -0.2789923
## timing  0.001954935 0.1083369  0.006581147  0.01148378 -0.2950721
##             [,11]       [,12]       [,13]        [,14]     [,15]     [,16]
## time   -0.1114878 -0.06136598 -0.12692995 -0.033035718 0.1602561 0.1819595
## timing -0.1528243 -0.02683910 -0.04586432 -0.006941847 0.1877923 0.1238436
##              [,17]      [,18]      [,19]      [,20]      [,21]      [,22]
## time   -0.03596105 -0.2016186 -0.2101724 0.08240323 -0.2503305 -0.0445167
## timing  0.05619359 -0.1557516 -0.2415527 0.07812360 -0.2731776 -0.0676725
##            [,23]     [,24]       [,25]      [,26]     [,27]      [,28]
## time   0.1572251 0.1617647 -0.06774928 -0.1734825 0.2782832 -0.2015833
## timing 0.1365548 0.1787091 -0.02061462 -0.1838940 0.2682680 -0.2212698
##             [,29]       [,30]      [,31]       [,32]      [,33]     [,34]
## time   0.10875557 -0.07843916 -0.1502843  0.01841240 -0.4211736 0.2542137
## timing 0.08099189 -0.03207089 -0.1386640 -0.02419036 -0.4641984 0.3166542
##              [,35]      [,36]      [,37]      [,38]      [,39]      [,40]
## time    0.01928528 -0.1512249 -0.3161519 -0.1299659 -0.0375267 -0.2170463
## timing -0.06774233 -0.1079156 -0.3149860 -0.1222331  0.0527419 -0.2442012
##              [,41]     [,42]     [,43]      [,44]      [,45]      [,46]
## time   -0.04962618 0.3314911 0.4370219 0.08671725 -0.1727597 -0.3335396
## timing -0.08149398 0.4144699 0.3825505 0.07200103 -0.1836946 -0.2818088
##              [,47]      [,48]      [,49]      [,50]       [,51]
## time   -0.05624731 0.08083803 -0.1317770 0.07230521 -0.09932293
## timing -0.08051181 0.03614210 -0.1806652 0.06604423 -0.08535457
##              [,52]     [,53]      [,54]      [,55]      [,56]       [,57]
## time   -0.11461210 0.2607400 0.08632218 -0.1488330 -0.2531838 -0.03895384
## timing -0.07844964 0.3223052 0.05222030 -0.1717677 -0.2205622 -0.02333721
##             [,58]      [,59]     [,60]      [,61]      [,62]       [,63]
## time   -0.3424254 0.04278037 0.1689096 -0.1056348 -0.1434594 -0.02822483
## timing -0.2967960 0.02236059 0.1672297 -0.1252252 -0.1686832 -0.02320290
##            [,64]       [,65]      [,66]       [,67]      [,68]       [,69]
## time   0.2836030 -0.05280998 -0.3436534 -0.02034674 -0.2993127 -0.05885123
## timing 0.1993495 -0.07640383 -0.3430315 -0.02444376 -0.2476875 -0.06625663
##              [,70]       [,71]     [,72]      [,73]      [,74]      [,75]
## time   -0.04588715 -0.02427904 0.1638719 -0.1058129 -0.1422997 0.03128994
## timing -0.08237944 -0.02333532 0.1505387 -0.1163863 -0.1464228 0.03218240
##               [,76]       [,77]     [,78]      [,79]     [,80]       [,81]
## time   0.0224461202 -0.01795163 0.2611265 -0.1362406 0.3668818 -0.04946095
## timing 0.0003987909 -0.01808129 0.2468416 -0.2027739 0.3490791  0.02447402
##             [,82]      [,83]       [,84]        [,85]       [,86]
## time   0.08991257 -0.1858367 -0.05103145 -0.002523978 -0.02535998
## timing 0.06575561 -0.2205030 -0.06934454 -0.009881624 -0.01118312
##            [,87]       [,88]      [,89]         [,90]     [,91]     [,92]
## time   0.1771359 0.071123928 -0.2513928 -0.0019195473 0.2960517 0.1136135
## timing 0.1984777 0.003042816 -0.2696292  0.0005993398 0.3349214 0.1100874
##            [,93]       [,94]      [,95]      [,96]      [,97]      [,98]
## time   0.1573669 -0.10728244 -0.2556348 0.04110903 -0.2052287 -0.1840108
## timing 0.1239912 -0.04266323 -0.2853611 0.11719076 -0.1944775 -0.1793768
##              [,99]       [,100]
## time   -0.06497227 -0.001372997
## timing -0.04601051  0.001257821
  # Compute the distance between the embeddings of two related words.
  # The result is a 1 x 1 matrix, shown here via top-level auto-printing.
  # NOTE(review): presumably a smaller value means the words are closer;
  # confirm the exact metric in the fastrtext documentation.
  get_word_distance(model, "time", "timing")
##            [,1]
## [1,] 0.02568624
  # Clean up temporary files and free memory.
  # tmp_file_model is only the output *prefix*: the model is stored at
  # "<prefix>.bin" (load_model() adds that extension), and fastText also
  # writes the word vectors to "<prefix>.vec". Unlinking the bare prefix alone
  # would leave both files behind in the temporary directory, so remove the
  # suffixed paths as well. unlink() silently ignores paths that do not exist.
  unlink(tmp_file_txt)
  unlink(c(tmp_file_model, paste0(tmp_file_model, c(".bin", ".vec"))))
  # Drop the reference to the model, then trigger garbage collection so the
  # memory it held can be reclaimed.
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  579691 31.0    1173328 62.7  1173328 62.7
## Vcells 1250580  9.6    8388608 64.0  1758476 13.5