Unsupervised learning

M. Benesty

2019-05-30

 library(fastrtext)
    
    # Scratch paths: one for the training corpus, one passed as the model output.
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()

    # Load the train/test sentence datasets.
    data("train_sentences")
    data("test_sentences")

    # Lower-case the "text" column and write it out, one element per line.
    texts <- tolower(train_sentences[, "text"])
    writeLines(texts, tmp_file_txt)

    # Train a skipgram model on the corpus file.
    execute(
      commands = c(
        "skipgram",
        "-input", tmp_file_txt,
        "-output", tmp_file_model,
        "-verbose", 1
      )
    )
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   29766 lr:  0.000000 loss:  2.690165 ETA:   0h 0m
    # Load the trained model from the path given to "-output" during training;
    # per the message printed below, the loader appends ".bin" to the path.
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # Inspect the vocabulary: show its first five entries.
    dict <- get_dictionary(model)
    print(head(dict, n = 5L))
## [1] "the"  "</s>" "of"   "to"   "and"
  # Show the embedding rows for two related words.
  print(
    get_word_vectors(model, c("time", "timing"))
  )
##              [,1]        [,2]        [,3]       [,4]      [,5]       [,6]
## time   0.05088438  0.05607361 -0.01896769 0.04886409 0.1809831 0.03411374
## timing 0.08077801 -0.01604572 -0.07798109 0.09950676 0.1475459 0.02544693
##              [,7]        [,8]       [,9]      [,10]       [,11]
## time   0.03292818 -0.03469704 0.03935269 -0.2035902 -0.07952464
## timing 0.05079525 -0.03046088 0.08975574 -0.2459510 -0.12654993
##              [,12]      [,13]      [,14]     [,15]     [,16]        [,17]
## time   -0.09481326 -0.1453379 -0.1119982 0.1412176 0.2050022 -0.001779601
## timing -0.07197757 -0.0954503 -0.1171161 0.1875602 0.1490941  0.081912234
##             [,18]      [,19]      [,20]      [,21]       [,22]     [,23]
## time   -0.2560630 -0.2224898 0.08816943 -0.2740619 -0.07579061 0.1389103
## timing -0.2338775 -0.2597868 0.07754761 -0.3025244 -0.09879884 0.1237624
##            [,24]       [,25]      [,26]     [,27]      [,28]     [,29]
## time   0.1371472 -0.11411522 -0.1091941 0.3458687 -0.1838181 0.1357762
## timing 0.1603793 -0.05737492 -0.1182581 0.3406205 -0.1989422 0.1150411
##               [,30]      [,31]      [,32]      [,33]     [,34]
## time   -0.046432931 -0.1758087 0.08345105 -0.3543756 0.2779189
## timing -0.001023501 -0.1703276 0.04046606 -0.4173390 0.3626297
##               [,35]      [,36]      [,37]      [,38]       [,39]
## time    0.070349418 -0.1543195 -0.2717811 -0.1416332 -0.06034501
## timing -0.008522567 -0.1311772 -0.2738557 -0.1493265  0.01033850
##             [,40]       [,41]     [,42]     [,43]     [,44]      [,45]
## time   -0.2026320 -0.09996526 0.2747053 0.3319716 0.1292427 -0.2436440
## timing -0.2159843 -0.12653658 0.3581319 0.2775868 0.1155489 -0.2795126
##             [,46]       [,47]      [,48]      [,49]      [,50]       [,51]
## time   -0.3593859 -0.06599605 0.10233121 -0.1634134 0.07284379 -0.03536733
## timing -0.3264923 -0.09049401 0.05446184 -0.2197587 0.06682438 -0.02059510
##              [,52]     [,53]       [,54]      [,55]      [,56]       [,57]
## time   -0.08436604 0.2062258 0.040048089 -0.1984321 -0.2104268 -0.09096450
## timing -0.05459446 0.2722274 0.004775394 -0.2154241 -0.1934104 -0.08257532
##             [,58]      [,59]     [,60]      [,61]      [,62]       [,63]
## time   -0.3598753 0.04837102 0.2240054 -0.1037931 -0.1671340 0.009947983
## timing -0.3366675 0.02448860 0.2357042 -0.1163150 -0.1917544 0.005600205
##            [,64]       [,65]      [,66]      [,67]      [,68]      [,69]
## time   0.3269331 -0.05644738 -0.2745706 0.06729551 -0.2165707 -0.1050965
## timing 0.2434513 -0.09033417 -0.2636166 0.07117067 -0.1692644 -0.1177218
##              [,70]       [,71]     [,72]       [,73]      [,74]
## time    0.01472164 -0.03982066 0.1404141 -0.05828417 -0.1614093
## timing -0.02009961 -0.03726767 0.1344493 -0.05893959 -0.1574391
##              [,75]        [,76]       [,77]     [,78]       [,79]
## time   -0.03005699 -0.008173962 -0.03532284 0.2293657 -0.02563534
## timing -0.03350902 -0.021316521 -0.03361278 0.2210726 -0.09744229
##            [,80]       [,81]       [,82]      [,83]       [,84]
## time   0.2868811 -0.11741390 0.018015111 -0.1461677 -0.06136803
## timing 0.2792849 -0.04931451 0.002012071 -0.1684375 -0.07403993
##              [,85]        [,86]     [,87]      [,88]      [,89]      [,90]
## time   -0.05086259 -0.002902606 0.1775235 0.10358354 -0.2381248 0.02890908
## timing -0.05562060  0.013138759 0.2008613 0.04965838 -0.2423464 0.02622315
##            [,91]      [,92]     [,93]       [,94]      [,95]     [,96]
## time   0.2659948 0.09492836 0.1578416 -0.08241174 -0.1661093 0.0765876
## timing 0.3010473 0.08637539 0.1364994 -0.02004028 -0.2006530 0.1520273
##             [,97]      [,98]       [,99]       [,100]
## time   -0.2379430 -0.1550552 -0.08563807 -0.012569305
## timing -0.2416937 -0.1549676 -0.05896453 -0.008807855
  # Distance between the vectors of two words; the result is auto-printed
  # because the call sits at top level.
  get_word_distance(model, "time", "timing")
##            [,1]
## [1,] 0.02491287
  # Clean up temporary files and free memory.
  # fastText writes the trained model to "<output>.bin" and "<output>.vec",
  # not to the bare output path, so those are the files that must be removed.
  # The bare path is kept in the list for safety; unlink() ignores missing files.
  unlink(c(
    tmp_file_txt,
    tmp_file_model,
    paste0(tmp_file_model, c(".bin", ".vec"))
  ))
  # Drop the model reference, then trigger a garbage collection
  # (gc() output is auto-printed).
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  578965 31.0    1170568 62.6  1170568 62.6
## Vcells 1249027  9.6    8388608 64.0  1752142 13.4