admin管理员组

文章数量:1313769

I am trying to manipulate two dataframes called df1 and df2, which have the following structures:

## Example input 1: three rows with center_x / center_y coordinates and a
## numeric label column.
df1 <- data.frame(center_x = c(1, 2, 3),
                  center_y = c(4, 5, 6),
                  label = c(1, 2, 3))

## Example input 2: two rows with x / y coordinates and a character name
## column.
df2 <- data.frame(x = c(1, 2),
                  y = c(1, 1),
                  name = c("A", "B"))

I also have this function, euc.dist:

## Euclidean distance between 2-D points.
##
## The first two columns (or elements) of v1 and v2 are taken as the x and y
## coordinates. The result is v1 with the distance appended as a final column
## named d.val.
##
## Coordinates are pulled out with `[[`, so they are plain numeric vectors
## rather than one-column data frames. That makes the arithmetic vectorised:
## v1 may hold several rows, each compared against a single-row v2 (or
## against a v2 with the same number of rows, row by row). For a single-row
## data frame v1 the result is the same as before.
##
## Arguments:
##   v1  data frame whose first two columns are the x and y coordinates
##   v2  data frame whose first two columns are the x and y coordinates
## Value:
##   v1 with an extra trailing column d.val holding the distances
euc.dist <- function(v1, v2) {

  x.val <- (v1[[1]] - v2[[1]])^2
  y.val <- (v1[[2]] - v2[[2]])^2

  d.val <- sqrt(x.val + y.val)

  ## Add d.val to v1 input as an extra (for now unnamed) column
  out.df <- cbind(v1, as.numeric(d.val))

  ## Rename the last column of out.df as d.val
  names(out.df)[length(out.df)] <- "d.val"

  out.df
}

I want to have an output that looks like this across all the values in df1, stored as df3

R Code

## Distance from the first row of df1 to each row of df2, one call per row.
x <- euc.dist(df1[1, ], df2[1, ])
y <- euc.dist(df1[1, ], df2[2, ])

## Stack the two one-row results and display them.
df3 <- rbind(x, y)
df3

  center_x center_y label    d.val
1        1        4     1 3.000000
2        1        4     1 3.162278

I tried using apply(), but the result does not look like the output above:

> apply(df1, 1,
+       function (x) euc.dist(v1 = x,
+                             df2[1, ])) |>
+     t()

How can I change this apply() function?

I am trying to manipulate two dataframes called df1 and df2, which have the following structures:

## Example input 1: three rows with center_x / center_y coordinates and a
## numeric label column.
df1 <- data.frame(center_x = c(1, 2, 3),
                  center_y = c(4, 5, 6),
                  label = c(1, 2, 3))

## Example input 2: two rows with x / y coordinates and a character name
## column.
df2 <- data.frame(x = c(1, 2),
                  y = c(1, 1),
                  name = c("A", "B"))

I also have this function, euc.dist:

## Euclidean distance between 2-D points.
##
## The first two columns (or elements) of v1 and v2 are taken as the x and y
## coordinates. The result is v1 with the distance appended as a final column
## named d.val.
##
## Coordinates are pulled out with `[[`, so they are plain numeric vectors
## rather than one-column data frames. That makes the arithmetic vectorised:
## v1 may hold several rows, each compared against a single-row v2 (or
## against a v2 with the same number of rows, row by row). For a single-row
## data frame v1 the result is the same as before.
##
## Arguments:
##   v1  data frame whose first two columns are the x and y coordinates
##   v2  data frame whose first two columns are the x and y coordinates
## Value:
##   v1 with an extra trailing column d.val holding the distances
euc.dist <- function(v1, v2) {

  x.val <- (v1[[1]] - v2[[1]])^2
  y.val <- (v1[[2]] - v2[[2]])^2

  d.val <- sqrt(x.val + y.val)

  ## Add d.val to v1 input as an extra (for now unnamed) column
  out.df <- cbind(v1, as.numeric(d.val))

  ## Rename the last column of out.df as d.val
  names(out.df)[length(out.df)] <- "d.val"

  out.df
}

I want to have an output that looks like this across all the values in df1, stored as df3

R Code

## Distance from the first row of df1 to each row of df2, one call per row.
x <- euc.dist(df1[1, ], df2[1, ])
y <- euc.dist(df1[1, ], df2[2, ])

## Stack the two one-row results and display them.
df3 <- rbind(x, y)
df3

  center_x center_y label    d.val
1        1        4     1 3.000000
2        1        4     1 3.162278

I tried using apply(), but the result does not look like the output above:

> apply(df1, 1,
+       function (x) euc.dist(v1 = x,
+                             df2[1, ])) |>
+     t()

How can I change this apply() function?

Share Improve this question edited Jan 31 at 21:41 cfausto asked Jan 31 at 21:14 cfaustocfausto 531 silver badge4 bronze badges 0
Add a comment  | 

2 Answers 2

Reset to default 2

The original question asked for labels (i.e. indexes) and used the data shown in the Note at the end but then changed it to ask for distances. We will use the data in the Note at the end and show how to get indexes and what modification is needed to get distances.

1) dista in Rfast computes the indexes of the shortest distances between the rows of two data frames; if you want the distances instead, remove the index = TRUE argument.

library(Rfast)

## Append a column wx to mols.test. The first two columns of each data frame
## are handed to dista(); per the accompanying text, with index = TRUE this
## yields the index of the shortest distance, and without it the distance.
## NOTE(review): confirm the argument order and return shape against the
## Rfast documentation.
transform(mols.test, 
  wx = dista(mols.test[1:2], cells.test[1:2], k = 1, index = TRUE))

2) dist in proxy gives the 2x3 matrix of distances, and then we use apply to get, for each row, the column index wx of the shortest distance; if you want distances, replace which.min with min.

library(proxy)

## dist() is called with two data frames (the first two columns of each);
## per the accompanying text it returns the matrix of cross distances.
## apply(..., 1, which.min) then picks, for every row of that matrix, the
## column holding the smallest distance, and transform() stores it as wx.
## NOTE(review): presumably this relies on proxy's dist() replacing the
## stats version once the package is attached -- confirm.
transform(mols.test,
  wx = apply(dist(mols.test[1:2], cells.test[1:2]), 1, which.min))

3) This forms the 2x3 matrix of distances using euc.dist from the question and then uses apply to get the indexes. Replace which.min with min if you prefer to get distances.

## Distance between row i of mols.test and row j of cells.test, read from the
## d.val column that euc.dist() appends. Vectorize() is needed because
## outer() calls its function once with whole index vectors.
f <- Vectorize(function(i, j) euc.dist(mols.test[i, ], cells.test[j, ])$d.val)

nr1 <- nrow(mols.test)
nr2 <- nrow(cells.test)

## outer() builds the nr1 x nr2 matrix of distances; which.min then gives,
## for each row, the column (i.e. cells.test row) with the smallest one.
## seq_len() is used instead of 1:n so that a zero-row data frame produces
## an empty index vector rather than c(1, 0).
transform(
  mols.test,
  wx = apply(outer(seq_len(nr1), seq_len(nr2), f), 1, which.min)
)

4) We can use a nearest neighbour algorithm. This gives both the index in column nn.idx and the distances in column nn.dists.

library(RANN)

## Nearest-neighbour search with k = 1, passing the first two columns of
## cells.test and of mols.test to nn2(). Per the accompanying text the result
## carries the index in nn.idx and the distance in nn.dists; cbind() places
## both next to the mols.test columns.
## NOTE(review): confirm which argument is the searched set and which is the
## query set against the RANN documentation.
cbind(mols.test, nn2(cells.test[1:2], mols.test[1:2], k = 1))

5) If we just need the index, then knn1 from the class package could be used; class comes preinstalled with R, so it does not have to be installed separately.

library(class)

## 1-nearest-neighbour classification: the cells.test coordinates are
## labelled with their own row numbers, so the label returned for each
## mols.test point is the row index of its nearest cells.test point.
## seq_len() replaces 1:n, which would give c(1, 0) for a zero-row frame.
transform(mols.test,
  wx = knn1(cells.test[1:2], mols.test[1:2], seq_len(nrow(cells.test))))

or if we need distances once we have the indexes then using euc.dist is easier:

library(class)

nr1 <- nrow(mols.test)
nr2 <- nrow(cells.test)

## Nearest cells.test row for every mols.test row: the cells.test points are
## labelled with their own row numbers. seq_len() replaces 1:n, which would
## give c(1, 0) for a zero-row data frame.
ix <- knn1(cells.test[1:2], mols.test[1:2], seq_len(nr2))

## Distance between row i of mols.test and row j of cells.test, using only
## the two coordinate columns.
f <- \(i, j) euc.dist(mols.test[i, 1:2], cells.test[j, 1:2])$d.val

## Pair each mols.test row with its nearest cells.test row and store the
## resulting distance as wx.
transform(mols.test, wx = mapply(f, seq_len(nr1), ix))

Note

Questions to SO should provide the input in an easily reproducible manner so that others can copy and paste it into their session but we have done it for you this time.

## Example data used by the answer's snippets: three rows with integer
## center_x / center_y coordinates, a label and an area.
cells.test <- data.frame(
  center_x = c(44173L, 41618L, 43681L),
  center_y = c(1763L, 1835L, 1824L),
  label = 1:3,
  area = c(15036L, 20343L, 13258L)
)

## Example data used by the answer's snippets: two rows with integer x / y
## coordinates, a name and a z value.
mols.test <- data.frame(
  x = c(14948L, 27614L),
  y = c(15230L, 6906L),
  name = c("UMOD", "UMOD"),
  z = c(1L, 1L)
)

You can try

## Encode every point as a complex number x + iy: drop the third column,
## then multiply the remaining two coordinate columns by c(1, 1i).
v1 <- c(as.matrix(df1[-3]) %*% c(1, 1i))
v2 <- c(as.matrix(df2[-3]) %*% c(1, 1i))

## The modulus of the difference of two such numbers is the Euclidean
## distance between the points. outer(v2, v1, `-`) has one column per df1
## row; flattening it column by column lines the distances up with df1 rows
## that are each repeated nrow(df2) times.
## seq_len() replaces 1:nrow(df1), which would give c(1, 0) for zero rows.
cbind(
    df1[rep(seq_len(nrow(df1)), each = nrow(df2)), ],
    d.val = c(abs(outer(v2, v1, `-`)))
)

which gives

    center_x center_y label    d.val
1          1        4     1 3.000000
1.1        1        4     1 3.162278
2          2        5     2 4.123106
2.1        2        5     2 4.000000
3          3        6     3 5.385165
3.1        3        6     3 5.099020

本文标签: