Skip to content

Commit bef3a2e

Browse files
committed
CLOUDSTACK-1191: rbd: Use cloning for deploying templates instead of a copy
RBD format 2 supports cloning (aka layering), where one base image can serve as a parent image for multiple child images. This enables fast deployment of a large number of virtual machines, and it also saves space on the Ceph cluster and improves performance due to better caching. Qemu-img doesn't support RBD format 2 (yet), so to enable these functions the RADOS/RBD Java bindings are required. This patch also enables deployment of System VMs on RBD storage pools. Since we no longer require a patchdisk for passing the boot arguments, we are able to deploy these VMs on RBD.
1 parent ed2ce27 commit bef3a2e

File tree

5 files changed

+246
-25
lines changed

5 files changed

+246
-25
lines changed

engine/storage/src/org/apache/cloudstack/storage/allocator/AbstractStoragePoolAllocator.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -167,12 +167,6 @@ protected boolean filter(ExcludeList avoid, StoragePool pool, DiskProfile dskCh,
167167
return false;
168168
}
169169

170-
DiskOfferingVO diskOffering = _diskOfferingDao.findById(dskCh.getDiskOfferingId());
171-
if (diskOffering.getSystemUse() && pool.getPoolType() == StoragePoolType.RBD) {
172-
s_logger.debug("Skipping RBD pool " + pool.getName() + " as a suitable pool. RBD is not supported for System VM's");
173-
return false;
174-
}
175-
176170

177171
Long clusterId = pool.getClusterId();
178172
ClusterVO cluster = _clusterDao.findById(clusterId);

plugins/hypervisors/kvm/pom.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
<id>libvirt-org</id>
2525
<url>http://libvirt.org/maven2</url>
2626
</repository>
27+
<repository>
28+
<id>ceph-com</id>
29+
<url>http://ceph.com/maven</url>
30+
</repository>
2731
</repositories>
2832
<dependencies>
2933
<dependency>
@@ -36,6 +40,11 @@
3640
<artifactId>libvirt</artifactId>
3741
<version>${cs.libvirt-java.version}</version>
3842
</dependency>
43+
<dependency>
44+
<groupId>com.ceph</groupId>
45+
<artifactId>rados</artifactId>
46+
<version>${cs.rados-java.version}</version>
47+
</dependency>
3948
</dependencies>
4049
<build>
4150
<defaultGoal>install</defaultGoal>

plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,6 +1253,13 @@ private CheckNetworkAnswer execute(CheckNetworkCommand cmd) {
12531253
}
12541254

12551255
private CopyVolumeAnswer execute(CopyVolumeCommand cmd) {
1256+
/**
1257+
This method is only used for copying files from Primary Storage TO Secondary Storage
1258+
1259+
It COULD also do it the other way around, but the code in the ManagementServerImpl shows
1260+
that it always sets copyToSecondary to true
1261+
1262+
*/
12561263
boolean copyToSecondary = cmd.toSecondaryStorage();
12571264
String volumePath = cmd.getVolumePath();
12581265
StorageFilerTO pool = cmd.getPool();

plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/storage/LibvirtStorageAdaptor.java

Lines changed: 229 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
package com.cloud.hypervisor.kvm.storage;
1818

1919
import java.io.File;
20+
import java.io.FileInputStream;
21+
import java.io.BufferedInputStream;
22+
import java.io.IOException;
2023
import java.net.URI;
2124
import java.net.URISyntaxException;
2225
import java.util.ArrayList;
@@ -37,6 +40,12 @@
3740
import org.libvirt.StoragePoolInfo;
3841
import org.libvirt.StorageVol;
3942
import org.libvirt.StoragePoolInfo.StoragePoolState;
43+
import com.ceph.rados.Rados;
44+
import com.ceph.rados.RadosException;
45+
import com.ceph.rados.IoCTX;
46+
import com.ceph.rbd.Rbd;
47+
import com.ceph.rbd.RbdImage;
48+
import com.ceph.rbd.RbdException;
4049

4150
import com.cloud.agent.api.ManageSnapshotCommand;
4251
import com.cloud.hypervisor.kvm.resource.LibvirtConnection;
@@ -63,6 +72,8 @@ public class LibvirtStorageAdaptor implements StorageAdaptor {
6372
private String _mountPoint = "/mnt";
6473
private String _manageSnapshotPath;
6574

75+
private String rbdTemplateSnapName = "cloudstack-base-snap";
76+
6677
public LibvirtStorageAdaptor(StorageLayer storage) {
6778
_storageLayer = storage;
6879
_manageSnapshotPath = Script.findScript("scripts/storage/qcow2/",
@@ -638,6 +649,15 @@ public boolean deletePhysicalDisk(String uuid, KVMStoragePool pool) {
638649
}
639650
}
640651

652+
/**
653+
* This function copies a physical disk from Secondary Storage to Primary Storage
654+
* or from Primary to Primary Storage
655+
*
656+
* The first time a template is deployed in Primary Storage it will be copied from
657+
* Secondary to Primary.
658+
*
659+
* If it has been created on Primary Storage, it will be copied on the Primary Storage
660+
*/
641661
@Override
642662
public KVMPhysicalDisk createDiskFromTemplate(KVMPhysicalDisk template,
643663
String name, PhysicalDiskFormat format, long size, KVMStoragePool destPool) {
@@ -690,21 +710,118 @@ public KVMPhysicalDisk createDiskFromTemplate(KVMPhysicalDisk template,
690710

691711
if (srcPool.getType() != StoragePoolType.RBD) {
692712
srcFile = new QemuImgFile(template.getPath(), template.getFormat());
713+
qemu.convert(srcFile, destFile);
693714
} else {
694-
template.setFormat(PhysicalDiskFormat.RAW);
695-
srcFile = new QemuImgFile(KVMPhysicalDisk.RBDStringBuilder(srcPool.getSourceHost(),
696-
srcPool.getSourcePort(),
697-
srcPool.getAuthUserName(),
698-
srcPool.getAuthSecret(),
699-
template.getPath()));
700-
srcFile.setFormat(template.getFormat());
715+
716+
/**
717+
* We have to find out if the source file is in the same RBD pool and has
718+
* RBD format 2 before we can do a layering/clone operation on the RBD image
719+
*
720+
* This will be the case when the template is already on Primary Storage and
721+
* we want to copy it
722+
*/
723+
724+
/* Feature 1<<0 means layering in RBD format 2 */
725+
int rbdFeatures = (1<<0);
726+
/* Order 0 means 4MB blocks (the default) */
727+
int rbdOrder = 0;
728+
729+
try {
730+
if ((srcPool.getSourceHost().equals(destPool.getSourceHost())) && (srcPool.getSourceDir().equals(destPool.getSourceDir()))) {
731+
/* We are on the same Ceph cluster, but we require RBD format 2 on the source image */
732+
s_logger.debug("Trying to perform a RBD clone (layering) since we are operating in the same storage pool");
733+
734+
Rados r = new Rados(srcPool.getAuthUserName());
735+
r.confSet("mon_host", srcPool.getSourceHost() + ":" + srcPool.getSourcePort());
736+
r.confSet("key", srcPool.getAuthSecret());
737+
r.connect();
738+
s_logger.debug("Succesfully connected to Ceph cluster at " + r.confGet("mon_host"));
739+
740+
IoCTX io = r.ioCtxCreate(srcPool.getSourceDir());
741+
Rbd rbd = new Rbd(io);
742+
RbdImage srcImage = rbd.open(template.getName());
743+
744+
if (srcImage.isOldFormat()) {
745+
/* The source image is RBD format 1, we have to do a regular copy */
746+
s_logger.debug("The source image " + srcPool.getSourceDir() + "/" + template.getName()
747+
+ " is RBD format 1. We have to perform a regular copy (" + template.getVirtualSize() + " bytes)");
748+
749+
rbd.create(disk.getName(), template.getVirtualSize(), rbdFeatures, rbdOrder);
750+
RbdImage destImage = rbd.open(disk.getName());
751+
752+
s_logger.debug("Starting to copy " + srcImage.getName() + " to " + destImage.getName() + " in Ceph pool " + srcPool.getSourceDir());
753+
rbd.copy(srcImage, destImage);
754+
755+
s_logger.debug("Finished copying " + srcImage.getName() + " to " + destImage.getName() + " in Ceph pool " + srcPool.getSourceDir());
756+
rbd.close(destImage);
757+
} else {
758+
s_logger.debug("The source image " + srcPool.getSourceDir() + "/" + template.getName()
759+
+ " is RBD format 2. We will perform a RBD clone using snapshot "
760+
+ this.rbdTemplateSnapName);
761+
/* The source image is format 2, we can do a RBD snapshot+clone (layering) */
762+
rbd.clone(template.getName(), this.rbdTemplateSnapName, io, disk.getName(), rbdFeatures, rbdOrder);
763+
s_logger.debug("Succesfully cloned " + template.getName() + "@" + this.rbdTemplateSnapName + " to " + disk.getName());
764+
}
765+
766+
rbd.close(srcImage);
767+
r.ioCtxDestroy(io);
768+
} else {
769+
/* The source pool or host is not the same Ceph cluster, we do a simple copy with Qemu-Img */
770+
s_logger.debug("Both the source and destination are RBD, but not the same Ceph cluster. Performing a copy");
771+
772+
Rados rSrc = new Rados(srcPool.getAuthUserName());
773+
rSrc.confSet("mon_host", srcPool.getSourceHost() + ":" + srcPool.getSourcePort());
774+
rSrc.confSet("key", srcPool.getAuthSecret());
775+
rSrc.connect();
776+
s_logger.debug("Succesfully connected to source Ceph cluster at " + rSrc.confGet("mon_host"));
777+
778+
Rados rDest = new Rados(destPool.getAuthUserName());
779+
rDest.confSet("mon_host", destPool.getSourceHost() + ":" + destPool.getSourcePort());
780+
rDest.confSet("key", destPool.getAuthSecret());
781+
rDest.connect();
782+
s_logger.debug("Succesfully connected to source Ceph cluster at " + rDest.confGet("mon_host"));
783+
784+
IoCTX sIO = rSrc.ioCtxCreate(srcPool.getSourceDir());
785+
Rbd sRbd = new Rbd(sIO);
786+
787+
IoCTX dIO = rDest.ioCtxCreate(destPool.getSourceDir());
788+
Rbd dRbd = new Rbd(dIO);
789+
790+
s_logger.debug("Creating " + disk.getName() + " on the destination cluster " + rDest.confGet("mon_host")
791+
+ " in pool " + destPool.getSourceDir());
792+
dRbd.create(disk.getName(), template.getVirtualSize(), rbdFeatures, rbdOrder);
793+
794+
RbdImage srcImage = sRbd.open(template.getName());
795+
RbdImage destImage = dRbd.open(disk.getName());
796+
797+
s_logger.debug("Copying " + template.getName() + " from Ceph cluster " + rSrc.confGet("mon_host") + " to " + disk.getName()
798+
+ " on cluster " + rDest.confGet("mon_host"));
799+
sRbd.copy(srcImage, destImage);
800+
801+
sRbd.close(srcImage);
802+
dRbd.close(destImage);
803+
804+
rSrc.ioCtxDestroy(sIO);
805+
rDest.ioCtxDestroy(dIO);
806+
}
807+
} catch (RadosException e) {
808+
s_logger.error("Failed to perform a RADOS action on the Ceph cluster, the error was: " + e.getMessage());
809+
disk = null;
810+
} catch (RbdException e) {
811+
s_logger.error("Failed to perform a RBD action on the Ceph cluster, the error was: " + e.getMessage());
812+
disk = null;
813+
}
701814
}
702-
qemu.convert(srcFile, destFile);
703815
}
704816
} catch (QemuImgException e) {
705817
s_logger.error("Failed to create " + disk.getPath() +
706818
" due to a failed executing of qemu-img: " + e.getMessage());
707819
}
820+
821+
if (disk == null) {
822+
throw new CloudRuntimeException("Failed to create " + disk.getPath() + " from template " + template.getName());
823+
}
824+
708825
return disk;
709826
}
710827

@@ -733,17 +850,26 @@ public List<KVMPhysicalDisk> listPhysicalDisks(String storagePoolUuid,
733850
}
734851
}
735852

853+
/**
854+
* This copies a volume from Primary Storage to Secondary Storage
855+
*
856+
* In theory it could also do it the other way around, but the current implementation
857+
* in ManagementServerImpl shows that the destPool is always a Secondary Storage Pool
858+
*/
736859
@Override
737860
public KVMPhysicalDisk copyPhysicalDisk(KVMPhysicalDisk disk, String name,
738861
KVMStoragePool destPool) {
739862

740-
/*
863+
/**
741864
With RBD you can't run qemu-img convert with an existing RBD image as destination
742865
qemu-img will exit with the error that the destination already exists.
743866
So for RBD we don't create the image, but let qemu-img do that for us.
744867
745868
We then create a KVMPhysicalDisk object that we can return
746-
*/
869+
870+
It is however very unlikely that the destPool will be RBD, since it isn't supported
871+
for Secondary Storage
872+
*/
747873

748874
KVMPhysicalDisk newDisk;
749875
if (destPool.getType() != StoragePoolType.RBD) {
@@ -791,15 +917,97 @@ public KVMPhysicalDisk copyPhysicalDisk(KVMPhysicalDisk disk, String name,
791917
+ srcFile.getFileName() + " the error was: " + e.getMessage());
792918
}
793919
}
920+
921+
try {
922+
qemu.convert(srcFile, destFile);
923+
} catch (QemuImgException e) {
924+
s_logger.error("Failed to convert " + srcFile.getFileName() + " to "
925+
+ destFile.getFileName() + " the error was: " + e.getMessage());
926+
}
927+
794928
} else if ((srcPool.getType() != StoragePoolType.RBD) && (destPool.getType() == StoragePoolType.RBD)) {
795-
srcFile = new QemuImgFile(sourcePath, sourceFormat);
796-
destFile = new QemuImgFile(KVMPhysicalDisk.RBDStringBuilder(destPool.getSourceHost(),
797-
destPool.getSourcePort(),
798-
destPool.getAuthUserName(),
799-
destPool.getAuthSecret(),
800-
destPath));
801-
destFile.setFormat(destFormat);
929+
/**
930+
* Qemu doesn't support writing to RBD format 2 directly, so we have to write to a temporary RAW file first
931+
* which we then convert to RBD format 2.
932+
*
933+
* A HUGE performance gain can be achieved here if QCOW2 -> RBD format 2 can be done in one step
934+
*/
935+
s_logger.debug("The source image is not RBD, but the destination is. We will convert into RBD format 2");
936+
String tmpFile = "/tmp/" + name;
937+
int rbdFeatures = (1<<0);
938+
int rbdOrder = 0;
939+
940+
try {
941+
srcFile = new QemuImgFile(sourcePath, sourceFormat);
942+
destFile = new QemuImgFile(tmpFile);
943+
s_logger.debug("Converting " + srcFile.getFileName() + " to " + tmpFile + " as a temporary file for RBD conversion");
944+
qemu.convert(srcFile, destFile);
945+
946+
// We now convert the temporary file to a RBD image with format 2
947+
Rados r = new Rados(destPool.getAuthUserName());
948+
r.confSet("mon_host", destPool.getSourceHost() + ":" + destPool.getSourcePort());
949+
r.confSet("key", destPool.getAuthSecret());
950+
r.connect();
951+
s_logger.debug("Succesfully connected to Ceph cluster at " + r.confGet("mon_host"));
952+
953+
IoCTX io = r.ioCtxCreate(destPool.getSourceDir());
954+
Rbd rbd = new Rbd(io);
955+
956+
s_logger.debug("Creating RBD image " + name + " in Ceph pool " + destPool.getSourceDir() + " with RBD format 2");
957+
rbd.create(name, disk.getVirtualSize(), rbdFeatures, rbdOrder);
958+
959+
RbdImage image = rbd.open(name);
960+
961+
// We now read the temporary file and write it to the RBD image
962+
File fh = new File(tmpFile);
963+
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fh));
964+
965+
int chunkSize = 4194304;
966+
long offset = 0;
967+
s_logger.debug("Reading temporary file " + tmpFile + " (" + fh.length() + " bytes) into RBD image " + name + " in chunks of " + chunkSize + " bytes");
968+
while(true) {
969+
byte[] buf = new byte[chunkSize];
970+
971+
int bytes = bis.read(buf);
972+
if (bytes <= 0) {
973+
break;
974+
}
975+
image.write(buf, offset, bytes);
976+
offset += bytes;
977+
}
978+
s_logger.debug("Completed writing " + tmpFile + " to RBD image " + name + ". Bytes written: " + offset);
979+
bis.close();
980+
s_logger.debug("Removing temporary file " + tmpFile);
981+
fh.delete();
982+
983+
/* Snapshot the image and protect that snapshot so we can clone (layer) from it */
984+
s_logger.debug("Creating RBD snapshot " + this.rbdTemplateSnapName + " on image " + name);
985+
image.snapCreate(this.rbdTemplateSnapName);
986+
s_logger.debug("Protecting RBD snapshot " + this.rbdTemplateSnapName + " on image " + name);
987+
image.snapProtect(this.rbdTemplateSnapName);
988+
989+
rbd.close(image);
990+
r.ioCtxDestroy(io);
991+
} catch (QemuImgException e) {
992+
s_logger.error("Failed to do a temp convert from " + srcFile.getFileName() + " to "
993+
+ destFile.getFileName() + " the error was: " + e.getMessage());
994+
newDisk = null;
995+
} catch (RadosException e) {
996+
s_logger.error("A Ceph RADOS operation failed (" + e.getReturnValue() + "). The error was: " + e.getMessage());
997+
newDisk = null;
998+
} catch (RbdException e) {
999+
s_logger.error("A Ceph RBD operation failed (" + e.getReturnValue() + "). The error was: " + e.getMessage());
1000+
newDisk = null;
1001+
} catch (IOException e) {
1002+
s_logger.error("Failed reading the temporary file during the conversion to RBD: " + e.getMessage());
1003+
newDisk = null;
1004+
}
1005+
8021006
} else {
1007+
/**
1008+
We let Qemu-Img do the work here. Although we could work with librbd and have that do the cloning
1009+
it doesn't benefit us. It's better to keep the current code in place which works
1010+
*/
8031011
srcFile = new QemuImgFile(KVMPhysicalDisk.RBDStringBuilder(srcPool.getSourceHost(),
8041012
srcPool.getSourcePort(),
8051013
srcPool.getAuthUserName(),
@@ -812,17 +1020,19 @@ public KVMPhysicalDisk copyPhysicalDisk(KVMPhysicalDisk disk, String name,
8121020
destPool.getAuthSecret(),
8131021
destPath));
8141022
destFile.setFormat(destFormat);
815-
}
8161023

817-
if (srcFile != null && destFile != null) {
8181024
try {
8191025
qemu.convert(srcFile, destFile);
8201026
} catch (QemuImgException e) {
8211027
s_logger.error("Failed to convert " + srcFile.getFileName() + " to "
8221028
+ destFile.getFileName() + " the error was: " + e.getMessage());
1029+
newDisk = null;
8231030
}
8241031
}
8251032

1033+
if (newDisk == null) {
1034+
throw new CloudRuntimeException("Failed to copy " + disk.getPath() + " to " + name);
1035+
}
8261036

8271037
return newDisk;
8281038
}

pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
<cs.java-ipv6.version>0.10</cs.java-ipv6.version>
8484
<cs.replace.properties>build/replace.properties</cs.replace.properties>
8585
<cs.libvirt-java.version>0.4.9</cs.libvirt-java.version>
86+
<cs.rados-java.version>0.1.1</cs.rados-java.version>
8687
<cs.target.dir>target</cs.target.dir>
8788
<cs.daemon.version>1.0.10</cs.daemon.version>
8889
</properties>

0 commit comments

Comments
 (0)