Unsupervised learning

M. Benesty

2018-01-04

 library(fastrtext)
    
    # Load the train and test sentence datasets.
    data("train_sentences")
    data("test_sentences")

    # Reserve two temporary paths: one for the training corpus and one used as
    # the output path of the trained model.
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()

    # Lower-case the training text and write it to the corpus file.
    texts <- tolower(train_sentences[, "text"])
    writeLines(texts, tmp_file_txt)

    # Train a skipgram model on the corpus; arguments are passed as a
    # command-line style vector.
    execute(
      commands = c(
        "skipgram",
        "-input", tmp_file_txt,
        "-output", tmp_file_model,
        "-verbose", 1
      )
    )
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   18692 lr:  0.000000 loss:  2.815933 ETA:   0h 0m
    # Load the trained model. Only the output path prefix is passed here;
    # load_model() reports that it adds the .bin extension itself (see the
    # message printed below).
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # Word extraction: retrieve the model's dictionary and print its first
    # five entries.
    dict <- get_dictionary(model)
    print(head(dict, 5))
## [1] "the"  "</s>" "of"   "to"   "and"
  # Print the vectors of two words. The result is a matrix with one row per
  # requested word (100 columns in the output below).
  print(get_word_vectors(model, c("time", "timing")))
##              [,1]       [,2]        [,3]       [,4]      [,5]       [,6]
## time   0.10612551 0.07148871 -0.01400366 0.08170515 0.2107516 -0.0856003
## timing 0.09404954 0.01110726 -0.07381661 0.14503674 0.1630365 -0.0743561
##              [,7]        [,8]     [,9]      [,10]      [,11]       [,12]
## time   0.06452757 -0.09309069 0.122324 -0.1623787 -0.1547531 0.004745292
## timing 0.07033922 -0.08068457 0.136004 -0.1814308 -0.1790269 0.007925374
##              [,13]        [,14]      [,15]     [,16]      [,17]
## time   -0.12772675 -0.037346292 0.01983134 0.2867689 0.06779584
## timing -0.05574913 -0.002046481 0.07241297 0.2812291 0.12423315
##              [,18]      [,19]    [,20]      [,21]      [,22]       [,23]
## time   -0.12172756 -0.1413808 0.294580 -0.1018153 -0.1055674 -0.04908620
## timing -0.09207927 -0.1609025 0.343444 -0.1130185 -0.1195261 -0.06930766
##            [,24]       [,25]        [,26]     [,27]      [,28]     [,29]
## time   0.1014775 -0.06203054  0.024576681 0.3084509 -0.1508369 0.2751614
## timing 0.1310402 -0.04119042 -0.005296993 0.2946844 -0.1915578 0.3021637
##            [,30]      [,31]     [,32]      [,33]     [,34]      [,35]
## time   0.2015223 -0.1567805 0.1789271 -0.2121336 0.1109511 0.17252681
## timing 0.2049126 -0.1890568 0.1500673 -0.2708564 0.1567058 0.09886294
##               [,36]      [,37]       [,38]      [,39]      [,40]
## time   -0.049850523 -0.2123505 -0.13707161 -0.1758844 -0.3161319
## timing  0.005593059 -0.2113849 -0.09012935 -0.1475967 -0.3382239
##              [,41]      [,42]     [,43]      [,44]      [,45]       [,46]
## time   -0.08585676 0.06876358 0.3835841 0.03887715 -0.1559960 -0.09014536
## timing -0.06691644 0.13491063 0.3886381 0.05718888 -0.1793154 -0.06344484
##               [,47]     [,48]      [,49]     [,50]     [,51]        [,52]
## time   -0.004909037 0.2260344 -0.1501742 0.1084805 0.1115248 -0.001711588
## timing -0.021227960 0.1787634 -0.1597113 0.1651420 0.1235560  0.015674848
##            [,53]     [,54]      [,55]      [,56]       [,57]      [,58]
## time   0.2276037 0.1644684 -0.3088567 0.03679692 -0.04613351 -0.3499355
## timing 0.3194697 0.1295134 -0.3159224 0.05223772 -0.03112178 -0.3311987
##             [,59]      [,60]      [,61]      [,62]      [,63]     [,64]
## time   0.07351421 0.08378514 -0.1161545 0.06963819 -0.1031450 0.3446571
## timing 0.04180297 0.08577771 -0.1244836 0.06173485 -0.1001682 0.3035475
##             [,65]      [,66]       [,67]       [,68]        [,69]
## time   0.14940004 -0.1716473  0.01368067 -0.08428457 -0.009838303
## timing 0.07556404 -0.1996288 -0.01908610 -0.05758634 -0.037646051
##             [,70]         [,71]     [,72]        [,73]      [,74]
## time   0.08910245 -0.0124581112 0.1475237 -0.005907365 -0.3198989
## timing 0.04353718  0.0006954131 0.1313596  0.016816046 -0.2922211
##             [,75]      [,76]      [,77]        [,78]         [,79]
## time   0.03249507 0.01471741 0.01742277  0.001484244  0.0426888950
## timing 0.01686049 0.05143612 0.06074193 -0.084721111 -0.0004637837
##             [,80]       [,81]       [,82]      [,83]     [,84]       [,85]
## time   0.04099551 -0.09909890 0.039027784 -0.2211936 0.2067799 -0.07377546
## timing 0.02473989 -0.02969332 0.005983395 -0.2555504 0.2272858 -0.10093042
##             [,86]      [,87]     [,88]      [,89]       [,90]     [,91]
## time   -0.1851192 0.10437235 0.1726995 -0.1654970 -0.01782419 0.2476783
## timing -0.2301955 0.09857988 0.1706954 -0.1782061  0.01217376 0.2932041
##            [,92]     [,93]      [,94]      [,95]      [,96]       [,97]
## time   0.2673696 0.1590543 -0.1627807 -0.2106414 0.02332520 -0.12616825
## timing 0.2699354 0.1204549 -0.1377168 -0.2183254 0.06358713 -0.09300115
##             [,98]      [,99]      [,100]
## time   -0.1453499 -0.1403460 0.004437986
## timing -0.1514001 -0.1157987 0.027878668
  # Word distance: compute the distance between the vectors of two words.
  # The value is auto-printed as a 1 x 1 matrix (see output below).
  get_word_distance(model, "time", "timing")
##            [,1]
## [1,] 0.02508389
  # Clean up: delete the temporary files and free memory.
  unlink(tmp_file_txt)
  # tmp_file_model is only a path prefix: the model is stored with a .bin
  # suffix (load_model() appends it when loading) and fastText also writes a
  # .vec file next to it, so unlinking the bare prefix would remove nothing.
  unlink(paste0(tmp_file_model, c(".bin", ".vec")))
  # Drop the model object and run the garbage collector to release its memory.
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  559133 29.9     940480 50.3   940480 50.3
## Vcells 1160819  8.9    1943194 14.9  1548727 11.9