diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 09c184e41cf8..32c2e9da5f3a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -433,6 +433,12 @@ tcp_workaround_signed_windows - BOOLEAN not receive a window scaling option from them. Default: 0 +tcp_dma_copybreak - INTEGER + Lower limit, in bytes, of the size of socket reads that will be + offloaded to a DMA copy engine, if one is present in the system + and CONFIG_NET_DMA is enabled. + Default: 4096 + CIPSOv4 Variables: cipso_cache_enable - BOOLEAN diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b53e1d4bc486..a44c6da9bf83 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1042,6 +1042,8 @@ source "drivers/mmc/Kconfig" source "drivers/rtc/Kconfig" +source "drivers/dma/Kconfig" + endmenu source "fs/Kconfig" diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c index bc4871553f6a..bfe0c87e3397 100644 --- a/arch/arm/mach-iop13xx/setup.c +++ b/arch/arm/mach-iop13xx/setup.c @@ -25,6 +25,7 @@ #include #include #include +#include #define IOP13XX_UART_XTAL 33334000 #define IOP13XX_SETUP_DEBUG 0 @@ -236,19 +237,143 @@ static unsigned long iq8134x_probe_flash_size(void) } #endif +/* ADMA Channels */ +static struct resource iop13xx_adma_0_resources[] = { + [0] = { + .start = IOP13XX_ADMA_PHYS_BASE(0), + .end = IOP13XX_ADMA_UPPER_PA(0), + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_IOP13XX_ADMA0_EOT, + .end = IRQ_IOP13XX_ADMA0_EOT, + .flags = IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_IOP13XX_ADMA0_EOC, + .end = IRQ_IOP13XX_ADMA0_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_IOP13XX_ADMA0_ERR, + .end = IRQ_IOP13XX_ADMA0_ERR, + .flags = IORESOURCE_IRQ + } +}; + +static struct resource iop13xx_adma_1_resources[] = { + [0] = { + .start = IOP13XX_ADMA_PHYS_BASE(1), + .end = IOP13XX_ADMA_UPPER_PA(1), + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_IOP13XX_ADMA1_EOT, 
+ .end = IRQ_IOP13XX_ADMA1_EOT, + .flags = IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_IOP13XX_ADMA1_EOC, + .end = IRQ_IOP13XX_ADMA1_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_IOP13XX_ADMA1_ERR, + .end = IRQ_IOP13XX_ADMA1_ERR, + .flags = IORESOURCE_IRQ + } +}; + +static struct resource iop13xx_adma_2_resources[] = { + [0] = { + .start = IOP13XX_ADMA_PHYS_BASE(2), + .end = IOP13XX_ADMA_UPPER_PA(2), + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_IOP13XX_ADMA2_EOT, + .end = IRQ_IOP13XX_ADMA2_EOT, + .flags = IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_IOP13XX_ADMA2_EOC, + .end = IRQ_IOP13XX_ADMA2_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_IOP13XX_ADMA2_ERR, + .end = IRQ_IOP13XX_ADMA2_ERR, + .flags = IORESOURCE_IRQ + } +}; + +static u64 iop13xx_adma_dmamask = DMA_64BIT_MASK; +static struct iop_adma_platform_data iop13xx_adma_0_data = { + .hw_id = 0, + .pool_size = PAGE_SIZE, +}; + +static struct iop_adma_platform_data iop13xx_adma_1_data = { + .hw_id = 1, + .pool_size = PAGE_SIZE, +}; + +static struct iop_adma_platform_data iop13xx_adma_2_data = { + .hw_id = 2, + .pool_size = PAGE_SIZE, +}; + +/* The ids are fixed up later in iop13xx_platform_init */ +static struct platform_device iop13xx_adma_0_channel = { + .name = "iop-adma", + .id = 0, + .num_resources = 4, + .resource = iop13xx_adma_0_resources, + .dev = { + .dma_mask = &iop13xx_adma_dmamask, + .coherent_dma_mask = DMA_64BIT_MASK, + .platform_data = (void *) &iop13xx_adma_0_data, + }, +}; + +static struct platform_device iop13xx_adma_1_channel = { + .name = "iop-adma", + .id = 0, + .num_resources = 4, + .resource = iop13xx_adma_1_resources, + .dev = { + .dma_mask = &iop13xx_adma_dmamask, + .coherent_dma_mask = DMA_64BIT_MASK, + .platform_data = (void *) &iop13xx_adma_1_data, + }, +}; + +static struct platform_device iop13xx_adma_2_channel = { + .name = "iop-adma", + .id = 0, + .num_resources = 4, + .resource = iop13xx_adma_2_resources, + .dev = { + .dma_mask = 
&iop13xx_adma_dmamask, + .coherent_dma_mask = DMA_64BIT_MASK, + .platform_data = (void *) &iop13xx_adma_2_data, + }, +}; + void __init iop13xx_map_io(void) { /* Initialize the Static Page Table maps */ iotable_init(iop13xx_std_desc, ARRAY_SIZE(iop13xx_std_desc)); } -static int init_uart = 0; -static int init_i2c = 0; +static int init_uart; +static int init_i2c; +static int init_adma; void __init iop13xx_platform_init(void) { int i; - u32 uart_idx, i2c_idx, plat_idx; + u32 uart_idx, i2c_idx, adma_idx, plat_idx; struct platform_device *iop13xx_devices[IQ81340_MAX_PLAT_DEVICES]; /* set the bases so we can read the device id */ @@ -294,6 +419,12 @@ void __init iop13xx_platform_init(void) } } + if (init_adma == IOP13XX_INIT_ADMA_DEFAULT) { + init_adma |= IOP13XX_INIT_ADMA_0; + init_adma |= IOP13XX_INIT_ADMA_1; + init_adma |= IOP13XX_INIT_ADMA_2; + } + plat_idx = 0; uart_idx = 0; i2c_idx = 0; @@ -332,6 +463,56 @@ void __init iop13xx_platform_init(void) } } + /* initialize adma channel ids and capabilities */ + adma_idx = 0; + for (i = 0; i < IQ81340_NUM_ADMA; i++) { + struct iop_adma_platform_data *plat_data; + if ((init_adma & (1 << i)) && IOP13XX_SETUP_DEBUG) + printk(KERN_INFO + "Adding adma%d to platform device list\n", i); + switch (init_adma & (1 << i)) { + case IOP13XX_INIT_ADMA_0: + iop13xx_adma_0_channel.id = adma_idx++; + iop13xx_devices[plat_idx++] = &iop13xx_adma_0_channel; + plat_data = &iop13xx_adma_0_data; + dma_cap_set(DMA_MEMCPY, plat_data->cap_mask); + dma_cap_set(DMA_XOR, plat_data->cap_mask); + dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask); + dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask); + dma_cap_set(DMA_MEMSET, plat_data->cap_mask); + dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask); + dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask); + break; + case IOP13XX_INIT_ADMA_1: + iop13xx_adma_1_channel.id = adma_idx++; + iop13xx_devices[plat_idx++] = &iop13xx_adma_1_channel; + plat_data = &iop13xx_adma_1_data; + dma_cap_set(DMA_MEMCPY, 
plat_data->cap_mask); + dma_cap_set(DMA_XOR, plat_data->cap_mask); + dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask); + dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask); + dma_cap_set(DMA_MEMSET, plat_data->cap_mask); + dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask); + dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask); + break; + case IOP13XX_INIT_ADMA_2: + iop13xx_adma_2_channel.id = adma_idx++; + iop13xx_devices[plat_idx++] = &iop13xx_adma_2_channel; + plat_data = &iop13xx_adma_2_data; + dma_cap_set(DMA_MEMCPY, plat_data->cap_mask); + dma_cap_set(DMA_XOR, plat_data->cap_mask); + dma_cap_set(DMA_DUAL_XOR, plat_data->cap_mask); + dma_cap_set(DMA_ZERO_SUM, plat_data->cap_mask); + dma_cap_set(DMA_MEMSET, plat_data->cap_mask); + dma_cap_set(DMA_MEMCPY_CRC32C, plat_data->cap_mask); + dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask); + dma_cap_set(DMA_PQ_XOR, plat_data->cap_mask); + dma_cap_set(DMA_PQ_UPDATE, plat_data->cap_mask); + dma_cap_set(DMA_PQ_ZERO_SUM, plat_data->cap_mask); + break; + } + } + #ifdef CONFIG_MTD_PHYSMAP iq8134x_flash_resource.end = iq8134x_flash_resource.start + iq8134x_probe_flash_size() - 1; @@ -399,5 +580,35 @@ static int __init iop13xx_init_i2c_setup(char *str) return 1; } +static int __init iop13xx_init_adma_setup(char *str) +{ + if (str) { + while (*str != '\0') { + switch (*str) { + case '0': + init_adma |= IOP13XX_INIT_ADMA_0; + break; + case '1': + init_adma |= IOP13XX_INIT_ADMA_1; + break; + case '2': + init_adma |= IOP13XX_INIT_ADMA_2; + break; + case ',': + case '=': + break; + default: + PRINTK("\"iop13xx_init_adma\" malformed" + " at character: \'%c\'", *str); + *(str + 1) = '\0'; + init_adma = IOP13XX_INIT_ADMA_DEFAULT; + } + str++; + } + } + return 1; +} + +__setup("iop13xx_init_adma", iop13xx_init_adma_setup); __setup("iop13xx_init_uart", iop13xx_init_uart_setup); __setup("iop13xx_init_i2c", iop13xx_init_i2c_setup); diff --git a/arch/arm/mach-iop32x/glantank.c b/arch/arm/mach-iop32x/glantank.c index 5776fd884115..2b086ab2668c 
100644 --- a/arch/arm/mach-iop32x/glantank.c +++ b/arch/arm/mach-iop32x/glantank.c @@ -180,6 +180,8 @@ static void __init glantank_init_machine(void) platform_device_register(&iop3xx_i2c1_device); platform_device_register(&glantank_flash_device); platform_device_register(&glantank_serial_device); + platform_device_register(&iop3xx_dma_0_channel); + platform_device_register(&iop3xx_dma_1_channel); pm_power_off = glantank_power_off; } diff --git a/arch/arm/mach-iop32x/iq31244.c b/arch/arm/mach-iop32x/iq31244.c index d4eefbea1fe6..98cfa1cd6bdb 100644 --- a/arch/arm/mach-iop32x/iq31244.c +++ b/arch/arm/mach-iop32x/iq31244.c @@ -298,9 +298,14 @@ static void __init iq31244_init_machine(void) platform_device_register(&iop3xx_i2c1_device); platform_device_register(&iq31244_flash_device); platform_device_register(&iq31244_serial_device); + platform_device_register(&iop3xx_dma_0_channel); + platform_device_register(&iop3xx_dma_1_channel); if (is_ep80219()) pm_power_off = ep80219_power_off; + + if (!is_80219()) + platform_device_register(&iop3xx_aau_channel); } static int __init force_ep80219_setup(char *str) diff --git a/arch/arm/mach-iop32x/iq80321.c b/arch/arm/mach-iop32x/iq80321.c index 8d9f49164a84..18ad29f213b2 100644 --- a/arch/arm/mach-iop32x/iq80321.c +++ b/arch/arm/mach-iop32x/iq80321.c @@ -181,6 +181,9 @@ static void __init iq80321_init_machine(void) platform_device_register(&iop3xx_i2c1_device); platform_device_register(&iq80321_flash_device); platform_device_register(&iq80321_serial_device); + platform_device_register(&iop3xx_dma_0_channel); + platform_device_register(&iop3xx_dma_1_channel); + platform_device_register(&iop3xx_aau_channel); } MACHINE_START(IQ80321, "Intel IQ80321") diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c index d55005d64781..390a97d39e5a 100644 --- a/arch/arm/mach-iop32x/n2100.c +++ b/arch/arm/mach-iop32x/n2100.c @@ -245,6 +245,8 @@ static void __init n2100_init_machine(void) 
platform_device_register(&iop3xx_i2c0_device); platform_device_register(&n2100_flash_device); platform_device_register(&n2100_serial_device); + platform_device_register(&iop3xx_dma_0_channel); + platform_device_register(&iop3xx_dma_1_channel); pm_power_off = n2100_power_off; diff --git a/arch/arm/mach-iop33x/iq80331.c b/arch/arm/mach-iop33x/iq80331.c index 2b063180687a..433188ebff2a 100644 --- a/arch/arm/mach-iop33x/iq80331.c +++ b/arch/arm/mach-iop33x/iq80331.c @@ -136,6 +136,9 @@ static void __init iq80331_init_machine(void) platform_device_register(&iop33x_uart0_device); platform_device_register(&iop33x_uart1_device); platform_device_register(&iq80331_flash_device); + platform_device_register(&iop3xx_dma_0_channel); + platform_device_register(&iop3xx_dma_1_channel); + platform_device_register(&iop3xx_aau_channel); } MACHINE_START(IQ80331, "Intel IQ80331") diff --git a/arch/arm/mach-iop33x/iq80332.c b/arch/arm/mach-iop33x/iq80332.c index 7889ce3cb08e..416c09564cc6 100644 --- a/arch/arm/mach-iop33x/iq80332.c +++ b/arch/arm/mach-iop33x/iq80332.c @@ -136,6 +136,9 @@ static void __init iq80332_init_machine(void) platform_device_register(&iop33x_uart0_device); platform_device_register(&iop33x_uart1_device); platform_device_register(&iq80332_flash_device); + platform_device_register(&iop3xx_dma_0_channel); + platform_device_register(&iop3xx_dma_1_channel); + platform_device_register(&iop3xx_aau_channel); } MACHINE_START(IQ80332, "Intel IQ80332") diff --git a/arch/arm/plat-iop/Makefile b/arch/arm/plat-iop/Makefile index 4d2b1da3cd82..36bff0325959 100644 --- a/arch/arm/plat-iop/Makefile +++ b/arch/arm/plat-iop/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_ARCH_IOP32X) += setup.o obj-$(CONFIG_ARCH_IOP32X) += time.o obj-$(CONFIG_ARCH_IOP32X) += io.o obj-$(CONFIG_ARCH_IOP32X) += cp6.o +obj-$(CONFIG_ARCH_IOP32X) += adma.o # IOP33X obj-$(CONFIG_ARCH_IOP33X) += gpio.o @@ -21,6 +22,7 @@ obj-$(CONFIG_ARCH_IOP33X) += setup.o obj-$(CONFIG_ARCH_IOP33X) += time.o obj-$(CONFIG_ARCH_IOP33X) 
+= io.o obj-$(CONFIG_ARCH_IOP33X) += cp6.o +obj-$(CONFIG_ARCH_IOP33X) += adma.o # IOP13XX obj-$(CONFIG_ARCH_IOP13XX) += cp6.o diff --git a/arch/arm/plat-iop/adma.c b/arch/arm/plat-iop/adma.c new file mode 100644 index 000000000000..53c5e9a52eb1 --- /dev/null +++ b/arch/arm/plat-iop/adma.c @@ -0,0 +1,209 @@ +/* + * platform device definitions for the iop3xx dma/xor engines + * Copyright © 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ +#include +#include +#include +#include +#include + +#ifdef CONFIG_ARCH_IOP32X +#define IRQ_DMA0_EOT IRQ_IOP32X_DMA0_EOT +#define IRQ_DMA0_EOC IRQ_IOP32X_DMA0_EOC +#define IRQ_DMA0_ERR IRQ_IOP32X_DMA0_ERR + +#define IRQ_DMA1_EOT IRQ_IOP32X_DMA1_EOT +#define IRQ_DMA1_EOC IRQ_IOP32X_DMA1_EOC +#define IRQ_DMA1_ERR IRQ_IOP32X_DMA1_ERR + +#define IRQ_AA_EOT IRQ_IOP32X_AA_EOT +#define IRQ_AA_EOC IRQ_IOP32X_AA_EOC +#define IRQ_AA_ERR IRQ_IOP32X_AA_ERR +#endif +#ifdef CONFIG_ARCH_IOP33X +#define IRQ_DMA0_EOT IRQ_IOP33X_DMA0_EOT +#define IRQ_DMA0_EOC IRQ_IOP33X_DMA0_EOC +#define IRQ_DMA0_ERR IRQ_IOP33X_DMA0_ERR + +#define IRQ_DMA1_EOT IRQ_IOP33X_DMA1_EOT +#define IRQ_DMA1_EOC IRQ_IOP33X_DMA1_EOC +#define IRQ_DMA1_ERR IRQ_IOP33X_DMA1_ERR + +#define IRQ_AA_EOT IRQ_IOP33X_AA_EOT +#define IRQ_AA_EOC IRQ_IOP33X_AA_EOC +#define IRQ_AA_ERR IRQ_IOP33X_AA_ERR +#endif +/* AAU and DMA Channels */ +static struct resource iop3xx_dma_0_resources[] = { + [0] = { + .start = IOP3XX_DMA_PHYS_BASE(0), + .end = IOP3XX_DMA_UPPER_PA(0), + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_DMA0_EOT, + .end = IRQ_DMA0_EOT, + .flags = IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_DMA0_EOC, + .end = IRQ_DMA0_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_DMA0_ERR, + .end = IRQ_DMA0_ERR, + .flags = IORESOURCE_IRQ + } +}; + +static struct resource iop3xx_dma_1_resources[] = { + [0] = { + .start = IOP3XX_DMA_PHYS_BASE(1), + .end = IOP3XX_DMA_UPPER_PA(1), + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_DMA1_EOT, + .end = IRQ_DMA1_EOT, + .flags = IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_DMA1_EOC, + .end = IRQ_DMA1_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_DMA1_ERR, + .end = IRQ_DMA1_ERR, + .flags = IORESOURCE_IRQ + } +}; + + +static struct resource iop3xx_aau_resources[] = { + [0] = { + .start = IOP3XX_AAU_PHYS_BASE, + .end = IOP3XX_AAU_UPPER_PA, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_AA_EOT, + .end = IRQ_AA_EOT, + .flags = 
IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_AA_EOC, + .end = IRQ_AA_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_AA_ERR, + .end = IRQ_AA_ERR, + .flags = IORESOURCE_IRQ + } +}; + +static u64 iop3xx_adma_dmamask = DMA_32BIT_MASK; + +static struct iop_adma_platform_data iop3xx_dma_0_data = { + .hw_id = DMA0_ID, + .pool_size = PAGE_SIZE, +}; + +static struct iop_adma_platform_data iop3xx_dma_1_data = { + .hw_id = DMA1_ID, + .pool_size = PAGE_SIZE, +}; + +static struct iop_adma_platform_data iop3xx_aau_data = { + .hw_id = AAU_ID, + .pool_size = 3 * PAGE_SIZE, +}; + +struct platform_device iop3xx_dma_0_channel = { + .name = "iop-adma", + .id = 0, + .num_resources = 4, + .resource = iop3xx_dma_0_resources, + .dev = { + .dma_mask = &iop3xx_adma_dmamask, + .coherent_dma_mask = DMA_64BIT_MASK, + .platform_data = (void *) &iop3xx_dma_0_data, + }, +}; + +struct platform_device iop3xx_dma_1_channel = { + .name = "iop-adma", + .id = 1, + .num_resources = 4, + .resource = iop3xx_dma_1_resources, + .dev = { + .dma_mask = &iop3xx_adma_dmamask, + .coherent_dma_mask = DMA_64BIT_MASK, + .platform_data = (void *) &iop3xx_dma_1_data, + }, +}; + +struct platform_device iop3xx_aau_channel = { + .name = "iop-adma", + .id = 2, + .num_resources = 4, + .resource = iop3xx_aau_resources, + .dev = { + .dma_mask = &iop3xx_adma_dmamask, + .coherent_dma_mask = DMA_64BIT_MASK, + .platform_data = (void *) &iop3xx_aau_data, + }, +}; + +static int __init iop3xx_adma_cap_init(void) +{ + #ifdef CONFIG_ARCH_IOP32X /* the 32x DMA does not perform CRC32C */ + dma_cap_set(DMA_MEMCPY, iop3xx_dma_0_data.cap_mask); + dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask); + #else + dma_cap_set(DMA_MEMCPY, iop3xx_dma_0_data.cap_mask); + dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_0_data.cap_mask); + dma_cap_set(DMA_INTERRUPT, iop3xx_dma_0_data.cap_mask); + #endif + + #ifdef CONFIG_ARCH_IOP32X /* the 32x DMA does not perform CRC32C */ + dma_cap_set(DMA_MEMCPY, iop3xx_dma_1_data.cap_mask); + 
dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask); + #else + dma_cap_set(DMA_MEMCPY, iop3xx_dma_1_data.cap_mask); + dma_cap_set(DMA_MEMCPY_CRC32C, iop3xx_dma_1_data.cap_mask); + dma_cap_set(DMA_INTERRUPT, iop3xx_dma_1_data.cap_mask); + #endif + + #ifdef CONFIG_ARCH_IOP32X /* the 32x AAU does not perform zero sum */ + dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask); + dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask); + dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask); + #else + dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask); + dma_cap_set(DMA_ZERO_SUM, iop3xx_aau_data.cap_mask); + dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask); + dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask); + #endif + + return 0; +} + +arch_initcall(iop3xx_adma_cap_init); diff --git a/crypto/Kconfig b/crypto/Kconfig index 4ca0ab3448d9..07090e9f9bcf 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1,7 +1,17 @@ +# +# Generic algorithms support +# +config XOR_BLOCKS + tristate + +# +# async_tx api: hardware offloaded memory transfer/transform support +# +source "crypto/async_tx/Kconfig" + # # Cryptographic API Configuration # - menu "Cryptographic options" config CRYPTO diff --git a/crypto/Makefile b/crypto/Makefile index cce46a1c9dc7..0cf17f1ea151 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -50,3 +50,9 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o + +# +# generic algorithms and the async_tx api +# +obj-$(CONFIG_XOR_BLOCKS) += xor.o +obj-$(CONFIG_ASYNC_CORE) += async_tx/ diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig new file mode 100644 index 000000000000..d8fb39145986 --- /dev/null +++ b/crypto/async_tx/Kconfig @@ -0,0 +1,16 @@ +config ASYNC_CORE + tristate + +config ASYNC_MEMCPY + tristate + select ASYNC_CORE + +config ASYNC_XOR + tristate + select ASYNC_CORE + select XOR_BLOCKS + +config ASYNC_MEMSET + tristate + select ASYNC_CORE + diff --git 
a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile new file mode 100644 index 000000000000..27baa7d52fbc --- /dev/null +++ b/crypto/async_tx/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_ASYNC_CORE) += async_tx.o +obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o +obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o +obj-$(CONFIG_ASYNC_XOR) += async_xor.o diff --git a/crypto/async_tx/async_memcpy.c b/crypto/async_tx/async_memcpy.c new file mode 100644 index 000000000000..a973f4ef897d --- /dev/null +++ b/crypto/async_tx/async_memcpy.c @@ -0,0 +1,133 @@ +/* + * copy offload engine support + * + * Copyright © 2006, Intel Corporation. + * + * Dan Williams + * + * with architecture considerations by: + * Neil Brown + * Jeff Garzik + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#include +#include +#include +#include +#include + +/** + * async_memcpy - attempt to copy memory with a dma engine. 
+ * @dest: destination page + * @src: src page + * @dest_offset: offset in bytes into the destination page + * @src_offset: offset in bytes into the source page + * @len: length in bytes + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK, + * ASYNC_TX_KMAP_SRC, ASYNC_TX_KMAP_DST + * @depend_tx: memcpy depends on the result of this transaction + * @cb_fn: function to call when the memcpy completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset, + unsigned int src_offset, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMCPY); + struct dma_device *device = chan ? chan->device : NULL; + int int_en = cb_fn ? 1 : 0; + struct dma_async_tx_descriptor *tx = device ? + device->device_prep_dma_memcpy(chan, len, + int_en) : NULL; + + if (tx) { /* run the memcpy asynchronously */ + dma_addr_t addr; + enum dma_data_direction dir; + + pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len); + + dir = (flags & ASYNC_TX_ASSUME_COHERENT) ? + DMA_NONE : DMA_FROM_DEVICE; + + addr = dma_map_page(device->dev, dest, dest_offset, len, dir); + tx->tx_set_dest(addr, tx, 0); + + dir = (flags & ASYNC_TX_ASSUME_COHERENT) ? 
+ DMA_NONE : DMA_TO_DEVICE; + + addr = dma_map_page(device->dev, src, src_offset, len, dir); + tx->tx_set_src(addr, tx, 0); + + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + } else { /* run the memcpy synchronously */ + void *dest_buf, *src_buf; + pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len); + + /* wait for any prerequisite operations */ + if (depend_tx) { + /* if ack is already set then we cannot be sure + * we are referring to the correct operation + */ + BUG_ON(depend_tx->ack); + if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR) + panic("%s: DMA_ERROR waiting for depend_tx\n", + __FUNCTION__); + } + + if (flags & ASYNC_TX_KMAP_DST) + dest_buf = kmap_atomic(dest, KM_USER0) + dest_offset; + else + dest_buf = page_address(dest) + dest_offset; + + /* src gets its own atomic kmap slot so it cannot displace dest */ + if (flags & ASYNC_TX_KMAP_SRC) + src_buf = kmap_atomic(src, KM_USER1) + src_offset; + else + src_buf = page_address(src) + src_offset; + + memcpy(dest_buf, src_buf, len); + + if (flags & ASYNC_TX_KMAP_DST) + kunmap_atomic(dest_buf, KM_USER0); + + if (flags & ASYNC_TX_KMAP_SRC) + kunmap_atomic(src_buf, KM_USER1); + + async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param); + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_memcpy); + +static int __init async_memcpy_init(void) +{ + return 0; +} + +static void __exit async_memcpy_exit(void) +{ + do { } while (0); +} + +module_init(async_memcpy_init); +module_exit(async_memcpy_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("asynchronous memcpy api"); +MODULE_LICENSE("GPL"); diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c new file mode 100644 index 000000000000..66ef6351202e --- /dev/null +++ b/crypto/async_tx/async_memset.c @@ -0,0 +1,109 @@ +/* + * memory fill offload engine support + * + * Copyright © 2006, Intel Corporation. 
+ * + * Dan Williams + * + * with architecture considerations by: + * Neil Brown + * Jeff Garzik + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#include +#include +#include +#include +#include + +/** + * async_memset - attempt to fill memory with a dma engine. + * @dest: destination page + * @val: fill value + * @offset: offset in bytes into the destination page + * @len: length in bytes + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: memset depends on the result of this transaction + * @cb_fn: function to call when the memset completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_memset(struct page *dest, int val, unsigned int offset, + size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_MEMSET); + struct dma_device *device = chan ? chan->device : NULL; + int int_en = cb_fn ? 1 : 0; + struct dma_async_tx_descriptor *tx = device ? + device->device_prep_dma_memset(chan, val, len, + int_en) : NULL; + + if (tx) { /* run the memset asynchronously */ + dma_addr_t dma_addr; + enum dma_data_direction dir; + + pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len); + dir = (flags & ASYNC_TX_ASSUME_COHERENT) ? 
+ DMA_NONE : DMA_FROM_DEVICE; + + dma_addr = dma_map_page(device->dev, dest, offset, len, dir); + tx->tx_set_dest(dma_addr, tx, 0); + + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + } else { /* run the memset synchronously */ + void *dest_buf; + pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len); + + dest_buf = (void *) (((char *) page_address(dest)) + offset); + + /* wait for any prerequisite operations */ + if (depend_tx) { + /* if ack is already set then we cannot be sure + * we are referring to the correct operation + */ + BUG_ON(depend_tx->ack); + if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR) + panic("%s: DMA_ERROR waiting for depend_tx\n", + __FUNCTION__); + } + + memset(dest_buf, val, len); + + async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param); + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_memset); + +static int __init async_memset_init(void) +{ + return 0; +} + +static void __exit async_memset_exit(void) +{ + do { } while (0); +} + +module_init(async_memset_init); +module_exit(async_memset_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("asynchronous memset api"); +MODULE_LICENSE("GPL"); diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c new file mode 100644 index 000000000000..035007145e78 --- /dev/null +++ b/crypto/async_tx/async_tx.c @@ -0,0 +1,497 @@ +/* + * core routines for the asynchronous memory transfer/transform api + * + * Copyright © 2006, Intel Corporation. + * + * Dan Williams + * + * with architecture considerations by: + * Neil Brown + * Jeff Garzik + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#include +#include + +#ifdef CONFIG_DMA_ENGINE +static enum dma_state_client +dma_channel_add_remove(struct dma_client *client, + struct dma_chan *chan, enum dma_state state); + +static struct dma_client async_tx_dma = { + .event_callback = dma_channel_add_remove, + /* .cap_mask == 0 defaults to all channels */ +}; + +/** + * dma_cap_mask_all - enable iteration over all operation types + */ +static dma_cap_mask_t dma_cap_mask_all; + +/** + * chan_ref_percpu - tracks channel allocations per core/opertion + */ +struct chan_ref_percpu { + struct dma_chan_ref *ref; +}; + +static int channel_table_initialized; +static struct chan_ref_percpu *channel_table[DMA_TX_TYPE_END]; + +/** + * async_tx_lock - protect modification of async_tx_master_list and serialize + * rebalance operations + */ +static spinlock_t async_tx_lock; + +static struct list_head +async_tx_master_list = LIST_HEAD_INIT(async_tx_master_list); + +/* async_tx_issue_pending_all - start all transactions on all channels */ +void async_tx_issue_pending_all(void) +{ + struct dma_chan_ref *ref; + + rcu_read_lock(); + list_for_each_entry_rcu(ref, &async_tx_master_list, node) + ref->chan->device->device_issue_pending(ref->chan); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(async_tx_issue_pending_all); + +/* dma_wait_for_async_tx - spin wait for a transcation to complete + * @tx: transaction to wait on + */ +enum dma_status +dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) +{ + enum dma_status status; + struct dma_async_tx_descriptor *iter; + + if (!tx) + return DMA_SUCCESS; + + /* poll through the dependency chain, return when tx is complete */ + do { + iter = tx; + while (iter->cookie == -EBUSY) + iter = iter->parent; + + status 
= dma_sync_wait(iter->chan, iter->cookie); + } while (status == DMA_IN_PROGRESS || (iter != tx)); + + return status; +} +EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); + +/* async_tx_run_dependencies - helper routine for dma drivers to process + * (start) dependent operations on their target channel + * @tx: transaction with dependencies + */ +void +async_tx_run_dependencies(struct dma_async_tx_descriptor *tx) +{ + struct dma_async_tx_descriptor *dep_tx, *_dep_tx; + struct dma_device *dev; + struct dma_chan *chan; + + list_for_each_entry_safe(dep_tx, _dep_tx, &tx->depend_list, + depend_node) { + chan = dep_tx->chan; + dev = chan->device; + /* we can't depend on ourselves */ + BUG_ON(chan == tx->chan); + list_del(&dep_tx->depend_node); + tx->tx_submit(dep_tx); + + /* we need to poke the engine as client code does not + * know about dependency submission events + */ + dev->device_issue_pending(chan); + } +} +EXPORT_SYMBOL_GPL(async_tx_run_dependencies); + +static void +free_dma_chan_ref(struct rcu_head *rcu) +{ + struct dma_chan_ref *ref; + ref = container_of(rcu, struct dma_chan_ref, rcu); + kfree(ref); +} + +static void +init_dma_chan_ref(struct dma_chan_ref *ref, struct dma_chan *chan) +{ + INIT_LIST_HEAD(&ref->node); + INIT_RCU_HEAD(&ref->rcu); + ref->chan = chan; + atomic_set(&ref->count, 0); +} + +/** + * get_chan_ref_by_cap - returns the nth channel of the given capability + * defaults to returning the channel with the desired capability and the + * lowest reference count if the index can not be satisfied + * @cap: capability to match + * @index: nth channel desired, passing -1 has the effect of forcing the + * default return value + */ +static struct dma_chan_ref * +get_chan_ref_by_cap(enum dma_transaction_type cap, int index) +{ + struct dma_chan_ref *ret_ref = NULL, *min_ref = NULL, *ref; + + rcu_read_lock(); + list_for_each_entry_rcu(ref, &async_tx_master_list, node) + if (dma_has_cap(cap, ref->chan->device->cap_mask)) { + if (!min_ref) + min_ref = ref; + else 
if (atomic_read(&ref->count) < + atomic_read(&min_ref->count)) + min_ref = ref; + + if (index-- == 0) { + ret_ref = ref; + break; + } + } + rcu_read_unlock(); + + if (!ret_ref) + ret_ref = min_ref; + + if (ret_ref) + atomic_inc(&ret_ref->count); + + return ret_ref; +} + +/** + * async_tx_rebalance - redistribute the available channels, optimize + * for cpu isolation in the SMP case, and operation isolation in the + * uniprocessor case + */ +static void async_tx_rebalance(void) +{ + int cpu, cap, cpu_idx = 0; + unsigned long flags; + + if (!channel_table_initialized) + return; + + spin_lock_irqsave(&async_tx_lock, flags); + + /* undo the last distribution */ + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_possible_cpu(cpu) { + struct dma_chan_ref *ref = + per_cpu_ptr(channel_table[cap], cpu)->ref; + if (ref) { + atomic_set(&ref->count, 0); + per_cpu_ptr(channel_table[cap], cpu)->ref = + NULL; + } + } + + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_online_cpu(cpu) { + struct dma_chan_ref *new; + if (NR_CPUS > 1) + new = get_chan_ref_by_cap(cap, cpu_idx++); + else + new = get_chan_ref_by_cap(cap, -1); + + per_cpu_ptr(channel_table[cap], cpu)->ref = new; + } + + spin_unlock_irqrestore(&async_tx_lock, flags); +} + +static enum dma_state_client +dma_channel_add_remove(struct dma_client *client, + struct dma_chan *chan, enum dma_state state) +{ + unsigned long found, flags; + struct dma_chan_ref *master_ref, *ref; + enum dma_state_client ack = DMA_DUP; /* default: take no action */ + + switch (state) { + case DMA_RESOURCE_AVAILABLE: + found = 0; + rcu_read_lock(); + list_for_each_entry_rcu(ref, &async_tx_master_list, node) + if (ref->chan == chan) { + found = 1; + break; + } + rcu_read_unlock(); + + pr_debug("async_tx: dma resource available [%s]\n", + found ?
"old" : "new"); + + if (!found) + ack = DMA_ACK; + else + break; + + /* add the channel to the generic management list */ + master_ref = kmalloc(sizeof(*master_ref), GFP_KERNEL); + if (master_ref) { + /* keep a reference until async_tx is unloaded */ + dma_chan_get(chan); + init_dma_chan_ref(master_ref, chan); + spin_lock_irqsave(&async_tx_lock, flags); + list_add_tail_rcu(&master_ref->node, + &async_tx_master_list); + spin_unlock_irqrestore(&async_tx_lock, + flags); + } else { + printk(KERN_WARNING "async_tx: unable to create" + " new master entry in response to" + " a DMA_RESOURCE_AVAILABLE event" + " (-ENOMEM)\n"); + return DMA_DUP; /* channel not recorded: take no action, a bare 0 may alias DMA_ACK */ + } + + async_tx_rebalance(); + break; + case DMA_RESOURCE_REMOVED: + found = 0; + spin_lock_irqsave(&async_tx_lock, flags); + list_for_each_entry_rcu(ref, &async_tx_master_list, node) + if (ref->chan == chan) { + /* permit backing devices to go away */ + dma_chan_put(ref->chan); + list_del_rcu(&ref->node); + call_rcu(&ref->rcu, free_dma_chan_ref); + found = 1; + break; + } + spin_unlock_irqrestore(&async_tx_lock, flags); + + pr_debug("async_tx: dma resource removed [%s]\n", + found ? "ours" : "not ours"); + + if (found) + ack = DMA_ACK; + else + break; + + async_tx_rebalance(); + break; + case DMA_RESOURCE_SUSPEND: + case DMA_RESOURCE_RESUME: + printk(KERN_WARNING "async_tx: does not support dma channel" + " suspend/resume\n"); + break; + default: + BUG(); + } + + return ack; +} + +static int __init +async_tx_init(void) +{ + enum dma_transaction_type cap; + + spin_lock_init(&async_tx_lock); + bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); + + /* an interrupt will never be an explicit operation type.
+ * clearing this bit prevents allocation to a slot in 'channel_table' + */ + clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); + + for_each_dma_cap_mask(cap, dma_cap_mask_all) { + channel_table[cap] = alloc_percpu(struct chan_ref_percpu); + if (!channel_table[cap]) + goto err; + } + + channel_table_initialized = 1; + dma_async_client_register(&async_tx_dma); + dma_async_client_chan_request(&async_tx_dma); + + printk(KERN_INFO "async_tx: api initialized (async)\n"); + + return 0; +err: + printk(KERN_ERR "async_tx: initialization failure\n"); + + while (cap-- > 0) /* unwind earlier slots; stays correct if the enum type is unsigned */ + free_percpu(channel_table[cap]); + + return -ENOMEM; /* init routines follow the 0/-errno convention */ +} + +static void __exit async_tx_exit(void) +{ + enum dma_transaction_type cap; + + channel_table_initialized = 0; + + for_each_dma_cap_mask(cap, dma_cap_mask_all) + if (channel_table[cap]) + free_percpu(channel_table[cap]); + + dma_async_client_unregister(&async_tx_dma); +} + +/** + * async_tx_find_channel - find a channel to carry out the operation or let + * the transaction execute synchronously + * @depend_tx: transaction dependency + * @tx_type: transaction type + */ +struct dma_chan * +async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, + enum dma_transaction_type tx_type) +{ + /* see if we can keep the chain on one channel */ + if (depend_tx && + dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) + return depend_tx->chan; + else if (likely(channel_table_initialized)) { + struct dma_chan_ref *ref; + int cpu = get_cpu(); + ref = per_cpu_ptr(channel_table[tx_type], cpu)->ref; + put_cpu(); + return ref ?
ref->chan : NULL; + } else + return NULL; +} +EXPORT_SYMBOL_GPL(async_tx_find_channel); +#else +static int __init async_tx_init(void) +{ + printk(KERN_INFO "async_tx: api initialized (sync-only)\n"); + return 0; +} + +static void __exit async_tx_exit(void) +{ + do { } while (0); +} +#endif + +void +async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, + enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + tx->callback = cb_fn; + tx->callback_param = cb_param; + + /* set this new tx to run after depend_tx if: + * 1/ a dependency exists (depend_tx is !NULL) + * 2/ the tx can not be submitted to the current channel + */ + if (depend_tx && depend_tx->chan != chan) { + /* if ack is already set then we cannot be sure + * we are referring to the correct operation + */ + BUG_ON(depend_tx->ack); + + tx->parent = depend_tx; + spin_lock_bh(&depend_tx->lock); + list_add_tail(&tx->depend_node, &depend_tx->depend_list); + if (depend_tx->cookie == 0) { + struct dma_chan *dep_chan = depend_tx->chan; + struct dma_device *dep_dev = dep_chan->device; + dep_dev->device_dependency_added(dep_chan); + } + spin_unlock_bh(&depend_tx->lock); + + /* schedule an interrupt to trigger the channel switch */ + async_trigger_callback(ASYNC_TX_ACK, depend_tx, NULL, NULL); + } else { + tx->parent = NULL; + tx->tx_submit(tx); + } + + if (flags & ASYNC_TX_ACK) + async_tx_ack(tx); + + if (depend_tx && (flags & ASYNC_TX_DEP_ACK)) + async_tx_ack(depend_tx); +} +EXPORT_SYMBOL_GPL(async_tx_submit); + +/** + * async_trigger_callback - schedules the callback function to be run after + * any dependent operations have been completed. 
+ * @flags: ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: 'callback' requires the completion of this transaction + * @cb_fn: function to call after depend_tx completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_trigger_callback(enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan; + struct dma_device *device; + struct dma_async_tx_descriptor *tx; + + if (depend_tx) { + chan = depend_tx->chan; + device = chan->device; + + /* see if we can schedule an interrupt + * otherwise poll for completion + */ + if (device && !dma_has_cap(DMA_INTERRUPT, device->cap_mask)) + device = NULL; + + tx = device ? device->device_prep_dma_interrupt(chan) : NULL; + } else + tx = NULL; + + if (tx) { + pr_debug("%s: (async)\n", __FUNCTION__); + + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + } else { + pr_debug("%s: (sync)\n", __FUNCTION__); + + /* wait for any prerequisite operations */ + if (depend_tx) { + /* if ack is already set then we cannot be sure + * we are referring to the correct operation + */ + BUG_ON(depend_tx->ack); + if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR) + panic("%s: DMA_ERROR waiting for depend_tx\n", + __FUNCTION__); + } + + async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param); + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_trigger_callback); + +module_init(async_tx_init); +module_exit(async_tx_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Asynchronous Bulk Memory Transactions API"); +MODULE_LICENSE("GPL"); diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c new file mode 100644 index 000000000000..2575f674dcd5 --- /dev/null +++ b/crypto/async_tx/async_xor.c @@ -0,0 +1,327 @@ +/* + * xor offload engine api + * + * Copyright © 2006, Intel Corporation. 
+ * + * Dan Williams + * + * with architecture considerations by: + * Neil Brown + * Jeff Garzik + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#include +#include +#include +#include +#include +#include + +static void +do_async_xor(struct dma_async_tx_descriptor *tx, struct dma_device *device, + struct dma_chan *chan, struct page *dest, struct page **src_list, + unsigned int offset, unsigned int src_cnt, size_t len, + enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + dma_addr_t dma_addr; + enum dma_data_direction dir; + int i; + + pr_debug("%s: len: %zu\n", __FUNCTION__, len); + + dir = (flags & ASYNC_TX_ASSUME_COHERENT) ? + DMA_NONE : DMA_FROM_DEVICE; + + dma_addr = dma_map_page(device->dev, dest, offset, len, dir); + tx->tx_set_dest(dma_addr, tx, 0); + + dir = (flags & ASYNC_TX_ASSUME_COHERENT) ? 
+ DMA_NONE : DMA_TO_DEVICE; + + for (i = 0; i < src_cnt; i++) { + dma_addr = dma_map_page(device->dev, src_list[i], + offset, len, dir); + tx->tx_set_src(dma_addr, tx, i); + } + + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); +} + +static void +do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, + unsigned int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + void *_dest; + int i; + + pr_debug("%s: len: %zu\n", __FUNCTION__, len); + + /* reuse the 'src_list' array to convert to buffer pointers */ + for (i = 0; i < src_cnt; i++) + src_list[i] = (struct page *) + (page_address(src_list[i]) + offset); + + /* set destination address */ + _dest = page_address(dest) + offset; + + if (flags & ASYNC_TX_XOR_ZERO_DST) + memset(_dest, 0, len); + + xor_blocks(src_cnt, len, _dest, + (void **) src_list); + + async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param); +} + +/** + * async_xor - attempt to xor a set of blocks with a dma engine. + * xor_blocks always uses the dest as a source so the ASYNC_TX_XOR_ZERO_DST + * flag must be set to not include dest data in the calculation. The + * assumption with dma engines is that they only use the destination + * buffer as a source when it is explicitly specified in the source list. + * @dest: destination page + * @src_list: array of source pages (if the dest is also a source it must be + * at index zero). The contents of this array may be overwritten. + * @offset: byte offset into each page at which to start the transaction + * @src_cnt: number of source pages + * @len: length in bytes + * @flags: ASYNC_TX_XOR_ZERO_DST, ASYNC_TX_XOR_DROP_DST, + * ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: xor depends on the result of this transaction.
+ * @cb_fn: function to call when the xor completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_xor(struct page *dest, struct page **src_list, unsigned int offset, + int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_XOR); + struct dma_device *device = chan ? chan->device : NULL; + struct dma_async_tx_descriptor *tx = NULL; + dma_async_tx_callback _cb_fn; + void *_cb_param; + unsigned long local_flags; + int xor_src_cnt; + int i = 0, src_off = 0, int_en; + + BUG_ON(src_cnt <= 1); + + while (src_cnt) { + local_flags = flags; + if (device) { /* run the xor asynchronously */ + xor_src_cnt = min(src_cnt, device->max_xor); + /* if we are submitting additional xors + * only set the callback on the last transaction + */ + if (src_cnt > xor_src_cnt) { + local_flags &= ~ASYNC_TX_ACK; + _cb_fn = NULL; + _cb_param = NULL; + } else { + _cb_fn = cb_fn; + _cb_param = cb_param; + } + + int_en = _cb_fn ? 
1 : 0; + + tx = device->device_prep_dma_xor( + chan, xor_src_cnt, len, int_en); + + if (tx) { + do_async_xor(tx, device, chan, dest, + &src_list[src_off], offset, xor_src_cnt, len, + local_flags, depend_tx, _cb_fn, + _cb_param); + } else /* fall through */ + goto xor_sync; + } else { /* run the xor synchronously */ +xor_sync: + /* in the sync case the dest is an implied source + * (assumes the dest is at the src_off index) + */ + if (flags & ASYNC_TX_XOR_DROP_DST) { + src_cnt--; + src_off++; + } + + /* process up to 'MAX_XOR_BLOCKS' sources */ + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); + + /* if we are submitting additional xors + * only set the callback on the last transaction + */ + if (src_cnt > xor_src_cnt) { + local_flags &= ~ASYNC_TX_ACK; + _cb_fn = NULL; + _cb_param = NULL; + } else { + _cb_fn = cb_fn; + _cb_param = cb_param; + } + + /* wait for any prerequisite operations */ + if (depend_tx) { + /* if ack is already set then we cannot be sure + * we are referring to the correct operation + */ + BUG_ON(depend_tx->ack); + if (dma_wait_for_async_tx(depend_tx) == + DMA_ERROR) + panic("%s: DMA_ERROR waiting for " + "depend_tx\n", + __FUNCTION__); + } + + do_sync_xor(dest, &src_list[src_off], offset, + xor_src_cnt, len, local_flags, depend_tx, + _cb_fn, _cb_param); + } + + /* the previous tx is hidden from the client, + * so ack it + */ + if (i && depend_tx) + async_tx_ack(depend_tx); + + depend_tx = tx; + + if (src_cnt > xor_src_cnt) { + /* drop completed sources */ + src_cnt -= xor_src_cnt; + src_off += xor_src_cnt; + + /* unconditionally preserve the destination */ + flags &= ~ASYNC_TX_XOR_ZERO_DST; + + /* use the intermediate result a source, but remember + * it's dropped, because it's implied, in the sync case + */ + src_list[--src_off] = dest; + src_cnt++; + flags |= ASYNC_TX_XOR_DROP_DST; + } else + src_cnt = 0; + i++; + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_xor); + +static int page_is_zero(struct page *p, unsigned int offset, size_t len) +{ + 
char *a = page_address(p) + offset; + return ((*(u32 *) a) == 0 && + memcmp(a, a + 4, len - 4) == 0); +} + +/** + * async_xor_zero_sum - attempt a xor parity check with a dma engine. + * @dest: destination page used if the xor is performed synchronously + * @src_list: array of source pages. The dest page must be listed as a source + * at index zero. The contents of this array may be overwritten. + * @offset: offset in pages to start transaction + * @src_cnt: number of source pages + * @len: length in bytes + * @result: 0 if sum == 0 else non-zero + * @flags: ASYNC_TX_ASSUME_COHERENT, ASYNC_TX_ACK, ASYNC_TX_DEP_ACK + * @depend_tx: xor depends on the result of this transaction. + * @cb_fn: function to call when the xor completes + * @cb_param: parameter to pass to the callback routine + */ +struct dma_async_tx_descriptor * +async_xor_zero_sum(struct page *dest, struct page **src_list, + unsigned int offset, int src_cnt, size_t len, + u32 *result, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback cb_fn, void *cb_param) +{ + struct dma_chan *chan = async_tx_find_channel(depend_tx, DMA_ZERO_SUM); + struct dma_device *device = chan ? chan->device : NULL; + int int_en = cb_fn ? 1 : 0; + struct dma_async_tx_descriptor *tx = device ? + device->device_prep_dma_zero_sum(chan, src_cnt, len, result, + int_en) : NULL; + int i; + + BUG_ON(src_cnt <= 1); + + if (tx) { + dma_addr_t dma_addr; + enum dma_data_direction dir; + + pr_debug("%s: (async) len: %zu\n", __FUNCTION__, len); + + dir = (flags & ASYNC_TX_ASSUME_COHERENT) ? 
+ DMA_NONE : DMA_TO_DEVICE; + + for (i = 0; i < src_cnt; i++) { + dma_addr = dma_map_page(device->dev, src_list[i], + offset, len, dir); + tx->tx_set_src(dma_addr, tx, i); + } + + async_tx_submit(chan, tx, flags, depend_tx, cb_fn, cb_param); + } else { + unsigned long xor_flags = flags; + + pr_debug("%s: (sync) len: %zu\n", __FUNCTION__, len); + + xor_flags |= ASYNC_TX_XOR_DROP_DST; + xor_flags &= ~ASYNC_TX_ACK; + + tx = async_xor(dest, src_list, offset, src_cnt, len, xor_flags, + depend_tx, NULL, NULL); + + if (tx) { + if (dma_wait_for_async_tx(tx) == DMA_ERROR) + panic("%s: DMA_ERROR waiting for tx\n", + __FUNCTION__); + async_tx_ack(tx); + } + + *result = page_is_zero(dest, offset, len) ? 0 : 1; + + tx = NULL; + + async_tx_sync_epilog(flags, depend_tx, cb_fn, cb_param); + } + + return tx; +} +EXPORT_SYMBOL_GPL(async_xor_zero_sum); + +static int __init async_xor_init(void) +{ + return 0; +} + +static void __exit async_xor_exit(void) +{ + do { } while (0); +} + +module_init(async_xor_init); +module_exit(async_xor_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("asynchronous xor/xor-zero-sum api"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/xor.c b/crypto/xor.c similarity index 69% rename from drivers/md/xor.c rename to crypto/xor.c index 324897c4be4e..b2e6db075e49 100644 --- a/drivers/md/xor.c +++ b/crypto/xor.c @@ -26,32 +26,32 @@ static struct xor_block_template *active_template; void -xor_block(unsigned int count, unsigned int bytes, void **ptr) +xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs) { - unsigned long *p0, *p1, *p2, *p3, *p4; + unsigned long *p1, *p2, *p3, *p4; - p0 = (unsigned long *) ptr[0]; - p1 = (unsigned long *) ptr[1]; - if (count == 2) { - active_template->do_2(bytes, p0, p1); + p1 = (unsigned long *) srcs[0]; + if (src_count == 1) { + active_template->do_2(bytes, dest, p1); return; } - p2 = (unsigned long *) ptr[2]; - if (count == 3) { - active_template->do_3(bytes, p0, p1, p2); + p2 = 
(unsigned long *) srcs[1]; + if (src_count == 2) { + active_template->do_3(bytes, dest, p1, p2); return; } - p3 = (unsigned long *) ptr[3]; - if (count == 4) { - active_template->do_4(bytes, p0, p1, p2, p3); + p3 = (unsigned long *) srcs[2]; + if (src_count == 3) { + active_template->do_4(bytes, dest, p1, p2, p3); return; } - p4 = (unsigned long *) ptr[4]; - active_template->do_5(bytes, p0, p1, p2, p3, p4); + p4 = (unsigned long *) srcs[3]; + active_template->do_5(bytes, dest, p1, p2, p3, p4); } +EXPORT_SYMBOL(xor_blocks); /* Set of all registered templates. */ static struct xor_block_template *template_list; @@ -78,7 +78,7 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) now = jiffies; count = 0; while (jiffies == now) { - mb(); + mb(); /* prevent loop optimzation */ tmpl->do_2(BENCH_SIZE, b1, b2); mb(); count++; @@ -91,26 +91,26 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) speed = max * (HZ * BENCH_SIZE / 1024); tmpl->speed = speed; - printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name, + printk(KERN_INFO " %-10s: %5d.%03d MB/sec\n", tmpl->name, speed / 1000, speed % 1000); } -static int -calibrate_xor_block(void) +static int __init +calibrate_xor_blocks(void) { void *b1, *b2; struct xor_block_template *f, *fastest; b1 = (void *) __get_free_pages(GFP_KERNEL, 2); - if (! b1) { - printk("raid5: Yikes! No memory available.\n"); + if (!b1) { + printk(KERN_WARNING "xor: Yikes! 
No memory available.\n"); return -ENOMEM; } b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; /* - * If this arch/cpu has a short-circuited selection, don't loop through all - * the possible functions, just test the best one + * If this arch/cpu has a short-circuited selection, don't loop through + * all the possible functions, just test the best one */ fastest = NULL; @@ -122,11 +122,12 @@ calibrate_xor_block(void) #define xor_speed(templ) do_xor_speed((templ), b1, b2) if (fastest) { - printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n", + printk(KERN_INFO "xor: automatically using best " + "checksumming function: %s\n", fastest->name); xor_speed(fastest); } else { - printk(KERN_INFO "raid5: measuring checksumming speed\n"); + printk(KERN_INFO "xor: measuring software checksum speed\n"); XOR_TRY_TEMPLATES; fastest = template_list; for (f = fastest; f; f = f->next) @@ -134,7 +135,7 @@ calibrate_xor_block(void) fastest = f; } - printk("raid5: using function: %s (%d.%03d MB/sec)\n", + printk(KERN_INFO "xor: using function: %s (%d.%03d MB/sec)\n", fastest->name, fastest->speed / 1000, fastest->speed % 1000); #undef xor_speed @@ -147,8 +148,8 @@ calibrate_xor_block(void) static __exit void xor_exit(void) { } -EXPORT_SYMBOL(xor_block); MODULE_LICENSE("GPL"); -module_init(calibrate_xor_block); +/* when built-in xor.o must initialize before drivers/md/md.o */ +core_initcall(calibrate_xor_blocks); module_exit(xor_exit); diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 72be6c63edfc..b31756d59978 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -8,8 +8,8 @@ menu "DMA Engine support" config DMA_ENGINE bool "Support for DMA engines" ---help--- - DMA engines offload copy operations from the CPU to dedicated - hardware, allowing the copies to happen asynchronously. + DMA engines offload bulk memory operations from the CPU to dedicated + hardware, allowing the operations to happen asynchronously. 
comment "DMA Clients" @@ -32,4 +32,12 @@ config INTEL_IOATDMA ---help--- Enable support for the Intel(R) I/OAT DMA engine. +config INTEL_IOP_ADMA + tristate "Intel IOP ADMA support" + depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX) + select ASYNC_CORE + default m + ---help--- + Enable support for the Intel(R) IOP Series RAID engines. + endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index bdcfdbdb1aec..b3839b687ae0 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 322ee2984e3d..82489923af09 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -37,11 +37,11 @@ * Each device has a channels list, which runs unlocked but is never modified * once the device is registered, it's just setup by the driver. * - * Each client has a channels list, it's only modified under the client->lock - * and in an RCU callback, so it's safe to read under rcu_read_lock(). + * Each client is responsible for keeping track of the channels it uses. See + * the definition of dma_event_callback in dmaengine.h. * * Each device has a kref, which is initialized to 1 when the device is - * registered. A kref_put is done for each class_device registered. When the + * registered. A kref_get is done for each class_device registered. When the * class_device is released, the coresponding kref_put is done in the release * method. Every time one of the device's channels is allocated to a client, * a kref_get occurs. When the channel is freed, the coresponding kref_put @@ -51,14 +51,17 @@ * references to finish. * * Each channel has an open-coded implementation of Rusty Russell's "bigref," - * with a kref and a per_cpu local_t. 
A single reference is set when on an - * ADDED event, and removed with a REMOVE event. Net DMA client takes an - * extra reference per outstanding transaction. The relase function does a - * kref_put on the device. -ChrisL + * with a kref and a per_cpu local_t. A dma_chan_get is called when a client + * signals that it wants to use a channel, and dma_chan_put is called when + * a channel is removed or a client using it is unregesitered. A client can + * take extra references per outstanding transaction, as is the case with + * the NET DMA client. The release function does a kref_put on the device. + * -ChrisL, DanW */ #include #include +#include #include #include #include @@ -66,6 +69,7 @@ #include #include #include +#include static DEFINE_MUTEX(dma_list_mutex); static LIST_HEAD(dma_device_list); @@ -100,8 +104,19 @@ static ssize_t show_bytes_transferred(struct class_device *cd, char *buf) static ssize_t show_in_use(struct class_device *cd, char *buf) { struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); + int in_use = 0; - return sprintf(buf, "%d\n", (chan->client ? 
1 : 0)); + if (unlikely(chan->slow_ref) && + atomic_read(&chan->refcount.refcount) > 1) + in_use = 1; + else { + if (local_read(&(per_cpu_ptr(chan->local, + get_cpu())->refcount)) > 0) + in_use = 1; + put_cpu(); + } + + return sprintf(buf, "%d\n", in_use); } static struct class_device_attribute dma_class_attrs[] = { @@ -127,44 +142,73 @@ static struct class dma_devclass = { /* --- client and device registration --- */ +#define dma_chan_satisfies_mask(chan, mask) \ + __dma_chan_satisfies_mask((chan), &(mask)) +static int +__dma_chan_satisfies_mask(struct dma_chan *chan, dma_cap_mask_t *want) +{ + dma_cap_mask_t has; + + bitmap_and(has.bits, want->bits, chan->device->cap_mask.bits, + DMA_TX_TYPE_END); + return bitmap_equal(want->bits, has.bits, DMA_TX_TYPE_END); +} + /** - * dma_client_chan_alloc - try to allocate a channel to a client + * dma_client_chan_alloc - try to allocate channels to a client * @client: &dma_client * * Called with dma_list_mutex held. */ -static struct dma_chan *dma_client_chan_alloc(struct dma_client *client) +static void dma_client_chan_alloc(struct dma_client *client) { struct dma_device *device; struct dma_chan *chan; - unsigned long flags; int desc; /* allocated descriptor count */ + enum dma_state_client ack; - /* Find a channel, any DMA engine will do */ - list_for_each_entry(device, &dma_device_list, global_node) { + /* Find a channel */ + list_for_each_entry(device, &dma_device_list, global_node) list_for_each_entry(chan, &device->channels, device_node) { - if (chan->client) + if (!dma_chan_satisfies_mask(chan, client->cap_mask)) continue; desc = chan->device->device_alloc_chan_resources(chan); if (desc >= 0) { - kref_get(&device->refcount); - kref_init(&chan->refcount); - chan->slow_ref = 0; - INIT_RCU_HEAD(&chan->rcu); - chan->client = client; - spin_lock_irqsave(&client->lock, flags); - list_add_tail_rcu(&chan->client_node, - &client->channels); - spin_unlock_irqrestore(&client->lock, flags); - return chan; + ack = 
client->event_callback(client, + chan, + DMA_RESOURCE_AVAILABLE); + + /* we are done once this client rejects + * an available resource + */ + if (ack == DMA_ACK) { + dma_chan_get(chan); + kref_get(&device->refcount); + } else if (ack == DMA_NAK) + return; } } - } - - return NULL; } +enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) +{ + enum dma_status status; + unsigned long dma_sync_wait_timeout = jiffies + msecs_to_jiffies(5000); + + dma_async_issue_pending(chan); + do { + status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); + if (time_after_eq(jiffies, dma_sync_wait_timeout)) { + printk(KERN_ERR "dma_sync_wait_timeout!\n"); + return DMA_ERROR; + } + } while (status == DMA_IN_PROGRESS); + + return status; +} +EXPORT_SYMBOL(dma_sync_wait); + /** * dma_chan_cleanup - release a DMA channel's resources * @kref: kernel reference structure that contains the DMA channel device @@ -173,7 +217,6 @@ void dma_chan_cleanup(struct kref *kref) { struct dma_chan *chan = container_of(kref, struct dma_chan, refcount); chan->device->device_free_chan_resources(chan); - chan->client = NULL; kref_put(&chan->device->refcount, dma_async_device_cleanup); } EXPORT_SYMBOL(dma_chan_cleanup); @@ -189,7 +232,7 @@ static void dma_chan_free_rcu(struct rcu_head *rcu) kref_put(&chan->refcount, dma_chan_cleanup); } -static void dma_client_chan_free(struct dma_chan *chan) +static void dma_chan_release(struct dma_chan *chan) { atomic_add(0x7FFFFFFF, &chan->refcount.refcount); chan->slow_ref = 1; @@ -197,41 +240,42 @@ static void dma_client_chan_free(struct dma_chan *chan) } /** - * dma_chans_rebalance - reallocate channels to clients - * - * When the number of DMA channel in the system changes, - * channels need to be rebalanced among clients.
+ * dma_clients_notify_available - broadcast available channels to the clients */ -static void dma_chans_rebalance(void) +static void dma_clients_notify_available(void) { struct dma_client *client; - struct dma_chan *chan; - unsigned long flags; + + mutex_lock(&dma_list_mutex); + + list_for_each_entry(client, &dma_client_list, global_node) + dma_client_chan_alloc(client); + + mutex_unlock(&dma_list_mutex); +} + +/** + * dma_clients_notify_removed - tell the clients that a channel is going away + * @chan: channel on its way out + */ +static void dma_clients_notify_removed(struct dma_chan *chan) +{ + struct dma_client *client; + enum dma_state_client ack; mutex_lock(&dma_list_mutex); list_for_each_entry(client, &dma_client_list, global_node) { - while (client->chans_desired > client->chan_count) { - chan = dma_client_chan_alloc(client); - if (!chan) - break; - client->chan_count++; - client->event_callback(client, - chan, - DMA_RESOURCE_ADDED); - } - while (client->chans_desired < client->chan_count) { - spin_lock_irqsave(&client->lock, flags); - chan = list_entry(client->channels.next, - struct dma_chan, - client_node); - list_del_rcu(&chan->client_node); - spin_unlock_irqrestore(&client->lock, flags); - client->chan_count--; - client->event_callback(client, - chan, - DMA_RESOURCE_REMOVED); - dma_client_chan_free(chan); + ack = client->event_callback(client, chan, + DMA_RESOURCE_REMOVED); + + /* client was holding resources for this channel so + * free it + */ + if (ack == DMA_ACK) { + dma_chan_put(chan); + kref_put(&chan->device->refcount, + dma_async_device_cleanup); } } @@ -239,28 +283,14 @@ static void dma_chans_rebalance(void) } /** - * dma_async_client_register - allocate and register a &dma_client - * @event_callback: callback for notification of channel addition/removal + * dma_async_client_register - register a &dma_client + * @client: ptr to a client structure with valid 'event_callback' and 'cap_mask' */ -struct dma_client
*dma_async_client_register(dma_event_callback event_callback) +void dma_async_client_register(struct dma_client *client) { - struct dma_client *client; - - client = kzalloc(sizeof(*client), GFP_KERNEL); - if (!client) - return NULL; - - INIT_LIST_HEAD(&client->channels); - spin_lock_init(&client->lock); - client->chans_desired = 0; - client->chan_count = 0; - client->event_callback = event_callback; - mutex_lock(&dma_list_mutex); list_add_tail(&client->global_node, &dma_client_list); mutex_unlock(&dma_list_mutex); - - return client; } EXPORT_SYMBOL(dma_async_client_register); @@ -272,40 +302,42 @@ EXPORT_SYMBOL(dma_async_client_register); */ void dma_async_client_unregister(struct dma_client *client) { + struct dma_device *device; struct dma_chan *chan; + enum dma_state_client ack; if (!client) return; - rcu_read_lock(); - list_for_each_entry_rcu(chan, &client->channels, client_node) - dma_client_chan_free(chan); - rcu_read_unlock(); - mutex_lock(&dma_list_mutex); + /* free all channels the client is holding */ + list_for_each_entry(device, &dma_device_list, global_node) + list_for_each_entry(chan, &device->channels, device_node) { + ack = client->event_callback(client, chan, + DMA_RESOURCE_REMOVED); + + if (ack == DMA_ACK) { + dma_chan_put(chan); + kref_put(&chan->device->refcount, + dma_async_device_cleanup); + } + } + list_del(&client->global_node); mutex_unlock(&dma_list_mutex); - - kfree(client); - dma_chans_rebalance(); } EXPORT_SYMBOL(dma_async_client_unregister); /** - * dma_async_client_chan_request - request DMA channels - * @client: &dma_client - * @number: count of DMA channels requested - * - * Clients call dma_async_client_chan_request() to specify how many - * DMA channels they need, 0 to free all currently allocated. - * The resulting allocations/frees are indicated to the client via the - * event callback. 
+ * dma_async_client_chan_request - send all available channels to the + * client that satisfy the capability mask + * @client: requester */ -void dma_async_client_chan_request(struct dma_client *client, - unsigned int number) +void dma_async_client_chan_request(struct dma_client *client) { - client->chans_desired = number; - dma_chans_rebalance(); + mutex_lock(&dma_list_mutex); + dma_client_chan_alloc(client); + mutex_unlock(&dma_list_mutex); } EXPORT_SYMBOL(dma_async_client_chan_request); @@ -316,12 +348,31 @@ EXPORT_SYMBOL(dma_async_client_chan_request); int dma_async_device_register(struct dma_device *device) { static int id; - int chancnt = 0; + int chancnt = 0, rc; struct dma_chan* chan; if (!device) return -ENODEV; + /* validate device routines */ + BUG_ON(dma_has_cap(DMA_MEMCPY, device->cap_mask) && + !device->device_prep_dma_memcpy); + BUG_ON(dma_has_cap(DMA_XOR, device->cap_mask) && + !device->device_prep_dma_xor); + BUG_ON(dma_has_cap(DMA_ZERO_SUM, device->cap_mask) && + !device->device_prep_dma_zero_sum); + BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) && + !device->device_prep_dma_memset); + BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && + !device->device_prep_dma_interrupt); + + BUG_ON(!device->device_alloc_chan_resources); + BUG_ON(!device->device_free_chan_resources); + BUG_ON(!device->device_dependency_added); + BUG_ON(!device->device_is_tx_complete); + BUG_ON(!device->device_issue_pending); + BUG_ON(!device->dev); + init_completion(&device->done); kref_init(&device->refcount); device->dev_id = id++; @@ -338,17 +389,38 @@ int dma_async_device_register(struct dma_device *device) snprintf(chan->class_dev.class_id, BUS_ID_SIZE, "dma%dchan%d", device->dev_id, chan->chan_id); + rc = class_device_register(&chan->class_dev); + if (rc) { + chancnt--; + free_percpu(chan->local); + chan->local = NULL; + goto err_out; + } + kref_get(&device->refcount); - class_device_register(&chan->class_dev); + kref_init(&chan->refcount); + chan->slow_ref = 0; +
INIT_RCU_HEAD(&chan->rcu); } mutex_lock(&dma_list_mutex); list_add_tail(&device->global_node, &dma_device_list); mutex_unlock(&dma_list_mutex); - dma_chans_rebalance(); + dma_clients_notify_available(); return 0; + +err_out: + list_for_each_entry(chan, &device->channels, device_node) { + if (chan->local == NULL) + continue; + kref_put(&device->refcount, dma_async_device_cleanup); + class_device_unregister(&chan->class_dev); + chancnt--; + free_percpu(chan->local); + } + return rc; } EXPORT_SYMBOL(dma_async_device_register); @@ -371,32 +443,165 @@ static void dma_async_device_cleanup(struct kref *kref) void dma_async_device_unregister(struct dma_device *device) { struct dma_chan *chan; - unsigned long flags; mutex_lock(&dma_list_mutex); list_del(&device->global_node); mutex_unlock(&dma_list_mutex); list_for_each_entry(chan, &device->channels, device_node) { - if (chan->client) { - spin_lock_irqsave(&chan->client->lock, flags); - list_del(&chan->client_node); - chan->client->chan_count--; - spin_unlock_irqrestore(&chan->client->lock, flags); - chan->client->event_callback(chan->client, - chan, - DMA_RESOURCE_REMOVED); - dma_client_chan_free(chan); - } + dma_clients_notify_removed(chan); class_device_unregister(&chan->class_dev); + dma_chan_release(chan); } - dma_chans_rebalance(); kref_put(&device->refcount, dma_async_device_cleanup); wait_for_completion(&device->done); } EXPORT_SYMBOL(dma_async_device_unregister); +/** + * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses + * @chan: DMA channel to offload copy to + * @dest: destination address (virtual) + * @src: source address (virtual) + * @len: length + * + * Both @dest and @src must be mappable to a bus address according to the + * DMA mapping API rules for streaming mappings. + * Both @dest and @src must stay memory resident (kernel memory or locked + * user space pages). 
+ */ +dma_cookie_t +dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, + void *src, size_t len) +{ + struct dma_device *dev = chan->device; + struct dma_async_tx_descriptor *tx; + dma_addr_t addr; + dma_cookie_t cookie; + int cpu; + + tx = dev->device_prep_dma_memcpy(chan, len, 0); + if (!tx) + return -ENOMEM; + + tx->ack = 1; + tx->callback = NULL; + addr = dma_map_single(dev->dev, src, len, DMA_TO_DEVICE); + tx->tx_set_src(addr, tx, 0); + addr = dma_map_single(dev->dev, dest, len, DMA_FROM_DEVICE); + tx->tx_set_dest(addr, tx, 0); + cookie = tx->tx_submit(tx); + + cpu = get_cpu(); + per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; + per_cpu_ptr(chan->local, cpu)->memcpy_count++; + put_cpu(); + + return cookie; +} +EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); + +/** + * dma_async_memcpy_buf_to_pg - offloaded copy from address to page + * @chan: DMA channel to offload copy to + * @page: destination page + * @offset: offset in page to copy to + * @kdata: source address (virtual) + * @len: length + * + * Both @page/@offset and @kdata must be mappable to a bus address according + * to the DMA mapping API rules for streaming mappings. 
+ * Both @page/@offset and @kdata must stay memory resident (kernel memory or + * locked user space pages) + */ +dma_cookie_t +dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page, + unsigned int offset, void *kdata, size_t len) +{ + struct dma_device *dev = chan->device; + struct dma_async_tx_descriptor *tx; + dma_addr_t addr; + dma_cookie_t cookie; + int cpu; + + tx = dev->device_prep_dma_memcpy(chan, len, 0); + if (!tx) + return -ENOMEM; + + tx->ack = 1; + tx->callback = NULL; + addr = dma_map_single(dev->dev, kdata, len, DMA_TO_DEVICE); + tx->tx_set_src(addr, tx, 0); + addr = dma_map_page(dev->dev, page, offset, len, DMA_FROM_DEVICE); + tx->tx_set_dest(addr, tx, 0); + cookie = tx->tx_submit(tx); + + cpu = get_cpu(); + per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; + per_cpu_ptr(chan->local, cpu)->memcpy_count++; + put_cpu(); + + return cookie; +} +EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); + +/** + * dma_async_memcpy_pg_to_pg - offloaded copy from page to page + * @chan: DMA channel to offload copy to + * @dest_pg: destination page + * @dest_off: offset in page to copy to + * @src_pg: source page + * @src_off: offset in page to copy from + * @len: length + * + * Both @dest_pg/@dest_off and @src_pg/@src_off must be mappable to a bus + * address according to the DMA mapping API rules for streaming mappings. + * Both @dest_pg/@dest_off and @src_pg/@src_off must stay memory resident + * (kernel memory or locked user space pages).
+ */ +dma_cookie_t +dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, + unsigned int dest_off, struct page *src_pg, unsigned int src_off, + size_t len) +{ + struct dma_device *dev = chan->device; + struct dma_async_tx_descriptor *tx; + dma_addr_t addr; + dma_cookie_t cookie; + int cpu; + + tx = dev->device_prep_dma_memcpy(chan, len, 0); + if (!tx) + return -ENOMEM; + + tx->ack = 1; + tx->callback = NULL; + addr = dma_map_page(dev->dev, src_pg, src_off, len, DMA_TO_DEVICE); + tx->tx_set_src(addr, tx, 0); + addr = dma_map_page(dev->dev, dest_pg, dest_off, len, DMA_FROM_DEVICE); + tx->tx_set_dest(addr, tx, 0); + cookie = tx->tx_submit(tx); + + cpu = get_cpu(); + per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; + per_cpu_ptr(chan->local, cpu)->memcpy_count++; + put_cpu(); + + return cookie; +} +EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); + +void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, + struct dma_chan *chan) +{ + tx->chan = chan; + spin_lock_init(&tx->lock); + INIT_LIST_HEAD(&tx->depend_node); + INIT_LIST_HEAD(&tx->depend_list); +} +EXPORT_SYMBOL(dma_async_tx_descriptor_init); + static int __init dma_bus_init(void) { mutex_init(&dma_list_mutex); diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 850014139556..5fbe56b5cea0 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -32,16 +32,17 @@ #include #include #include "ioatdma.h" -#include "ioatdma_io.h" #include "ioatdma_registers.h" #include "ioatdma_hw.h" #define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common) #define to_ioat_device(dev) container_of(dev, struct ioat_device, common) #define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node) +#define tx_to_ioat_desc(tx) container_of(tx, struct ioat_desc_sw, async_tx) /* internal functions */ static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent); +static void ioat_shutdown(struct pci_dev *pdev); static void __devexit 
ioat_remove(struct pci_dev *pdev); static int enumerate_dma_channels(struct ioat_device *device) @@ -51,8 +52,8 @@ static int enumerate_dma_channels(struct ioat_device *device) int i; struct ioat_dma_chan *ioat_chan; - device->common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET); - xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET); + device->common.chancnt = readb(device->reg_base + IOAT_CHANCNT_OFFSET); + xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET); xfercap = (xfercap_scale == 0 ? -1 : (1UL << xfercap_scale)); for (i = 0; i < device->common.chancnt; i++) { @@ -71,13 +72,79 @@ static int enumerate_dma_channels(struct ioat_device *device) INIT_LIST_HEAD(&ioat_chan->used_desc); /* This should be made common somewhere in dmaengine.c */ ioat_chan->common.device = &device->common; - ioat_chan->common.client = NULL; list_add_tail(&ioat_chan->common.device_node, &device->common.channels); } return device->common.chancnt; } +static void +ioat_set_src(dma_addr_t addr, struct dma_async_tx_descriptor *tx, int index) +{ + struct ioat_desc_sw *iter, *desc = tx_to_ioat_desc(tx); + struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan); + + pci_unmap_addr_set(desc, src, addr); + + list_for_each_entry(iter, &desc->async_tx.tx_list, node) { + iter->hw->src_addr = addr; + addr += ioat_chan->xfercap; + } + +} + +static void +ioat_set_dest(dma_addr_t addr, struct dma_async_tx_descriptor *tx, int index) +{ + struct ioat_desc_sw *iter, *desc = tx_to_ioat_desc(tx); + struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan); + + pci_unmap_addr_set(desc, dst, addr); + + list_for_each_entry(iter, &desc->async_tx.tx_list, node) { + iter->hw->dst_addr = addr; + addr += ioat_chan->xfercap; + } +} + +static dma_cookie_t +ioat_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(tx->chan); + struct ioat_desc_sw *desc = tx_to_ioat_desc(tx); + int append = 0; + dma_cookie_t cookie; + struct ioat_desc_sw *group_start; 
+ + group_start = list_entry(desc->async_tx.tx_list.next, + struct ioat_desc_sw, node); + spin_lock_bh(&ioat_chan->desc_lock); + /* cookie incr and addition to used_list must be atomic */ + cookie = ioat_chan->common.cookie; + cookie++; + if (cookie < 0) + cookie = 1; + ioat_chan->common.cookie = desc->async_tx.cookie = cookie; + + /* write address into NextDescriptor field of last desc in chain */ + to_ioat_desc(ioat_chan->used_desc.prev)->hw->next = + group_start->async_tx.phys; + list_splice_init(&desc->async_tx.tx_list, ioat_chan->used_desc.prev); + + ioat_chan->pending += desc->tx_cnt; + if (ioat_chan->pending >= 4) { + append = 1; + ioat_chan->pending = 0; + } + spin_unlock_bh(&ioat_chan->desc_lock); + + if (append) + writeb(IOAT_CHANCMD_APPEND, + ioat_chan->reg_base + IOAT_CHANCMD_OFFSET); + + return cookie; +} + static struct ioat_desc_sw *ioat_dma_alloc_descriptor( struct ioat_dma_chan *ioat_chan, gfp_t flags) @@ -99,8 +166,13 @@ static struct ioat_desc_sw *ioat_dma_alloc_descriptor( } memset(desc, 0, sizeof(*desc)); + dma_async_tx_descriptor_init(&desc_sw->async_tx, &ioat_chan->common); + desc_sw->async_tx.tx_set_src = ioat_set_src; + desc_sw->async_tx.tx_set_dest = ioat_set_dest; + desc_sw->async_tx.tx_submit = ioat_tx_submit; + INIT_LIST_HEAD(&desc_sw->async_tx.tx_list); desc_sw->hw = desc; - desc_sw->phys = phys; + desc_sw->async_tx.phys = phys; return desc_sw; } @@ -123,7 +195,7 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) * In-use bit automatically set by reading chanctrl * If 0, we got it, if 1, someone else did */ - chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl = readw(ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET); if (chanctrl & IOAT_CHANCTRL_CHANNEL_IN_USE) return -EBUSY; @@ -132,12 +204,12 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) IOAT_CHANCTRL_ERR_INT_EN | IOAT_CHANCTRL_ANY_ERR_ABORT_EN | IOAT_CHANCTRL_ERR_COMPLETION_EN; - ioatdma_chan_write16(ioat_chan, 
IOAT_CHANCTRL_OFFSET, chanctrl); + writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET); - chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET); + chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); if (chanerr) { printk("IOAT: CHANERR = %x, clearing\n", chanerr); - ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr); + writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); } /* Allocate descriptors */ @@ -161,10 +233,10 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) &ioat_chan->completion_addr); memset(ioat_chan->completion_virt, 0, sizeof(*ioat_chan->completion_virt)); - ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW, - ((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF); - ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH, - ((u64) ioat_chan->completion_addr) >> 32); + writel(((u64) ioat_chan->completion_addr) & 0x00000000FFFFFFFF, + ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_LOW); + writel(((u64) ioat_chan->completion_addr) >> 32, + ioat_chan->reg_base + IOAT_CHANCMP_OFFSET_HIGH); ioat_start_null_desc(ioat_chan); return i; @@ -182,18 +254,20 @@ static void ioat_dma_free_chan_resources(struct dma_chan *chan) ioat_dma_memcpy_cleanup(ioat_chan); - ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET); + writeb(IOAT_CHANCMD_RESET, ioat_chan->reg_base + IOAT_CHANCMD_OFFSET); spin_lock_bh(&ioat_chan->desc_lock); list_for_each_entry_safe(desc, _desc, &ioat_chan->used_desc, node) { in_use_descs++; list_del(&desc->node); - pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys); + pci_pool_free(ioat_device->dma_pool, desc->hw, + desc->async_tx.phys); kfree(desc); } list_for_each_entry_safe(desc, _desc, &ioat_chan->free_desc, node) { list_del(&desc->node); - pci_pool_free(ioat_device->dma_pool, desc->hw, desc->phys); + pci_pool_free(ioat_device->dma_pool, desc->hw, + desc->async_tx.phys); kfree(desc); } spin_unlock_bh(&ioat_chan->desc_lock); @@ -210,50 +284,30 @@ static 
void ioat_dma_free_chan_resources(struct dma_chan *chan) ioat_chan->last_completion = ioat_chan->completion_addr = 0; /* Tell hw the chan is free */ - chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl = readw(ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET); chanctrl &= ~IOAT_CHANCTRL_CHANNEL_IN_USE; - ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); + writew(chanctrl, ioat_chan->reg_base + IOAT_CHANCTRL_OFFSET); } -/** - * do_ioat_dma_memcpy - actual function that initiates a IOAT DMA transaction - * @ioat_chan: IOAT DMA channel handle - * @dest: DMA destination address - * @src: DMA source address - * @len: transaction length in bytes - */ - -static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan, - dma_addr_t dest, - dma_addr_t src, - size_t len) +static struct dma_async_tx_descriptor * +ioat_dma_prep_memcpy(struct dma_chan *chan, size_t len, int int_en) { - struct ioat_desc_sw *first; - struct ioat_desc_sw *prev; - struct ioat_desc_sw *new; - dma_cookie_t cookie; + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + struct ioat_desc_sw *first, *prev, *new; LIST_HEAD(new_chain); u32 copy; size_t orig_len; - dma_addr_t orig_src, orig_dst; - unsigned int desc_count = 0; - unsigned int append = 0; - - if (!ioat_chan || !dest || !src) - return -EFAULT; + int desc_count = 0; if (!len) - return ioat_chan->common.cookie; + return NULL; orig_len = len; - orig_src = src; - orig_dst = dest; first = NULL; prev = NULL; spin_lock_bh(&ioat_chan->desc_lock); - while (len) { if (!list_empty(&ioat_chan->free_desc)) { new = to_ioat_desc(ioat_chan->free_desc.next); @@ -270,141 +324,36 @@ static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan, new->hw->size = copy; new->hw->ctl = 0; - new->hw->src_addr = src; - new->hw->dst_addr = dest; - new->cookie = 0; + new->async_tx.cookie = 0; + new->async_tx.ack = 1; /* chain together the physical address list for the HW */ if (!first) first = new; else - prev->hw->next 
= (u64) new->phys; + prev->hw->next = (u64) new->async_tx.phys; prev = new; - len -= copy; - dest += copy; - src += copy; - list_add_tail(&new->node, &new_chain); desc_count++; } + + list_splice(&new_chain, &new->async_tx.tx_list); + new->hw->ctl = IOAT_DMA_DESCRIPTOR_CTL_CP_STS; new->hw->next = 0; + new->tx_cnt = desc_count; + new->async_tx.ack = 0; /* client is in control of this ack */ + new->async_tx.cookie = -EBUSY; - /* cookie incr and addition to used_list must be atomic */ - - cookie = ioat_chan->common.cookie; - cookie++; - if (cookie < 0) - cookie = 1; - ioat_chan->common.cookie = new->cookie = cookie; - - pci_unmap_addr_set(new, src, orig_src); - pci_unmap_addr_set(new, dst, orig_dst); pci_unmap_len_set(new, src_len, orig_len); pci_unmap_len_set(new, dst_len, orig_len); - - /* write address into NextDescriptor field of last desc in chain */ - to_ioat_desc(ioat_chan->used_desc.prev)->hw->next = first->phys; - list_splice_init(&new_chain, ioat_chan->used_desc.prev); - - ioat_chan->pending += desc_count; - if (ioat_chan->pending >= 20) { - append = 1; - ioat_chan->pending = 0; - } - spin_unlock_bh(&ioat_chan->desc_lock); - if (append) - ioatdma_chan_write8(ioat_chan, - IOAT_CHANCMD_OFFSET, - IOAT_CHANCMD_APPEND); - return cookie; + return new ? 
&new->async_tx : NULL; } -/** - * ioat_dma_memcpy_buf_to_buf - wrapper that takes src & dest bufs - * @chan: IOAT DMA channel handle - * @dest: DMA destination address - * @src: DMA source address - * @len: transaction length in bytes - */ - -static dma_cookie_t ioat_dma_memcpy_buf_to_buf(struct dma_chan *chan, - void *dest, - void *src, - size_t len) -{ - dma_addr_t dest_addr; - dma_addr_t src_addr; - struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); - - dest_addr = pci_map_single(ioat_chan->device->pdev, - dest, len, PCI_DMA_FROMDEVICE); - src_addr = pci_map_single(ioat_chan->device->pdev, - src, len, PCI_DMA_TODEVICE); - - return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len); -} - -/** - * ioat_dma_memcpy_buf_to_pg - wrapper, copying from a buf to a page - * @chan: IOAT DMA channel handle - * @page: pointer to the page to copy to - * @offset: offset into that page - * @src: DMA source address - * @len: transaction length in bytes - */ - -static dma_cookie_t ioat_dma_memcpy_buf_to_pg(struct dma_chan *chan, - struct page *page, - unsigned int offset, - void *src, - size_t len) -{ - dma_addr_t dest_addr; - dma_addr_t src_addr; - struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); - - dest_addr = pci_map_page(ioat_chan->device->pdev, - page, offset, len, PCI_DMA_FROMDEVICE); - src_addr = pci_map_single(ioat_chan->device->pdev, - src, len, PCI_DMA_TODEVICE); - - return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len); -} - -/** - * ioat_dma_memcpy_pg_to_pg - wrapper, copying between two pages - * @chan: IOAT DMA channel handle - * @dest_pg: pointer to the page to copy to - * @dest_off: offset into that page - * @src_pg: pointer to the page to copy from - * @src_off: offset into that page - * @len: transaction length in bytes. This is guaranteed not to make a copy - * across a page boundary. 
- */ - -static dma_cookie_t ioat_dma_memcpy_pg_to_pg(struct dma_chan *chan, - struct page *dest_pg, - unsigned int dest_off, - struct page *src_pg, - unsigned int src_off, - size_t len) -{ - dma_addr_t dest_addr; - dma_addr_t src_addr; - struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); - - dest_addr = pci_map_page(ioat_chan->device->pdev, - dest_pg, dest_off, len, PCI_DMA_FROMDEVICE); - src_addr = pci_map_page(ioat_chan->device->pdev, - src_pg, src_off, len, PCI_DMA_TODEVICE); - - return do_ioat_dma_memcpy(ioat_chan, dest_addr, src_addr, len); -} /** * ioat_dma_memcpy_issue_pending - push potentially unrecognized appended descriptors to hw @@ -417,9 +366,8 @@ static void ioat_dma_memcpy_issue_pending(struct dma_chan *chan) if (ioat_chan->pending != 0) { ioat_chan->pending = 0; - ioatdma_chan_write8(ioat_chan, - IOAT_CHANCMD_OFFSET, - IOAT_CHANCMD_APPEND); + writeb(IOAT_CHANCMD_APPEND, + ioat_chan->reg_base + IOAT_CHANCMD_OFFSET); } } @@ -449,7 +397,7 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan) if ((chan->completion_virt->full & IOAT_CHANSTS_DMA_TRANSFER_STATUS) == IOAT_CHANSTS_DMA_TRANSFER_STATUS_HALTED) { printk("IOAT: Channel halted, chanerr = %x\n", - ioatdma_chan_read32(chan, IOAT_CHANERR_OFFSET)); + readl(chan->reg_base + IOAT_CHANERR_OFFSET)); /* TODO do something to salvage the situation */ } @@ -467,8 +415,8 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan) * exceeding xfercap, perhaps. If so, only the last one will * have a cookie, and require unmapping. */ - if (desc->cookie) { - cookie = desc->cookie; + if (desc->async_tx.cookie) { + cookie = desc->async_tx.cookie; /* yes we are unmapping both _page and _single alloc'd regions with unmap_page. Is this *really* that bad? 
@@ -483,14 +431,19 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan) PCI_DMA_TODEVICE); } - if (desc->phys != phys_complete) { - /* a completed entry, but not the last, so cleanup */ - list_del(&desc->node); - list_add_tail(&desc->node, &chan->free_desc); + if (desc->async_tx.phys != phys_complete) { + /* a completed entry, but not the last, so cleanup + * if the client is done with the descriptor + */ + if (desc->async_tx.ack) { + list_del(&desc->node); + list_add_tail(&desc->node, &chan->free_desc); + } else + desc->async_tx.cookie = 0; } else { /* last used desc. Do not remove, so we can append from it, but don't look at it next time, either */ - desc->cookie = 0; + desc->async_tx.cookie = 0; /* TODO check status bits? */ break; @@ -506,6 +459,17 @@ static void ioat_dma_memcpy_cleanup(struct ioat_dma_chan *chan) spin_unlock(&chan->cleanup_lock); } +static void ioat_dma_dependency_added(struct dma_chan *chan) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + spin_lock_bh(&ioat_chan->desc_lock); + if (ioat_chan->pending == 0) { + spin_unlock_bh(&ioat_chan->desc_lock); + ioat_dma_memcpy_cleanup(ioat_chan); + } else + spin_unlock_bh(&ioat_chan->desc_lock); +} + /** * ioat_dma_is_complete - poll the status of a IOAT DMA transaction * @chan: IOAT DMA channel handle @@ -553,6 +517,8 @@ static enum dma_status ioat_dma_is_complete(struct dma_chan *chan, static struct pci_device_id ioat_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT) }, + { PCI_DEVICE(PCI_VENDOR_ID_UNISYS, + PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR) }, { 0, } }; @@ -560,6 +526,7 @@ static struct pci_driver ioat_pci_driver = { .name = "ioatdma", .id_table = ioat_pci_tbl, .probe = ioat_probe, + .shutdown = ioat_shutdown, .remove = __devexit_p(ioat_remove), }; @@ -569,21 +536,21 @@ static irqreturn_t ioat_do_interrupt(int irq, void *data) unsigned long attnstatus; u8 intrctrl; - intrctrl = ioatdma_read8(instance, IOAT_INTRCTRL_OFFSET); + intrctrl = 
readb(instance->reg_base + IOAT_INTRCTRL_OFFSET); if (!(intrctrl & IOAT_INTRCTRL_MASTER_INT_EN)) return IRQ_NONE; if (!(intrctrl & IOAT_INTRCTRL_INT_STATUS)) { - ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl); + writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET); return IRQ_NONE; } - attnstatus = ioatdma_read32(instance, IOAT_ATTNSTATUS_OFFSET); + attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET); printk(KERN_ERR "ioatdma error: interrupt! status %lx\n", attnstatus); - ioatdma_write8(instance, IOAT_INTRCTRL_OFFSET, intrctrl); + writeb(intrctrl, instance->reg_base + IOAT_INTRCTRL_OFFSET); return IRQ_HANDLED; } @@ -607,19 +574,17 @@ static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan) desc->hw->ctl = IOAT_DMA_DESCRIPTOR_NUL; desc->hw->next = 0; + desc->async_tx.ack = 1; list_add_tail(&desc->node, &ioat_chan->used_desc); spin_unlock_bh(&ioat_chan->desc_lock); -#if (BITS_PER_LONG == 64) - ioatdma_chan_write64(ioat_chan, IOAT_CHAINADDR_OFFSET, desc->phys); -#else - ioatdma_chan_write32(ioat_chan, - IOAT_CHAINADDR_OFFSET_LOW, - (u32) desc->phys); - ioatdma_chan_write32(ioat_chan, IOAT_CHAINADDR_OFFSET_HIGH, 0); -#endif - ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_START); + writel(((u64) desc->async_tx.phys) & 0x00000000FFFFFFFF, + ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_LOW); + writel(((u64) desc->async_tx.phys) >> 32, + ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_HIGH); + + writeb(IOAT_CHANCMD_START, ioat_chan->reg_base + IOAT_CHANCMD_OFFSET); } /* @@ -633,6 +598,8 @@ static int ioat_self_test(struct ioat_device *device) u8 *src; u8 *dest; struct dma_chan *dma_chan; + struct dma_async_tx_descriptor *tx; + dma_addr_t addr; dma_cookie_t cookie; int err = 0; @@ -658,7 +625,15 @@ static int ioat_self_test(struct ioat_device *device) goto out; } - cookie = ioat_dma_memcpy_buf_to_buf(dma_chan, dest, src, IOAT_TEST_SIZE); + tx = ioat_dma_prep_memcpy(dma_chan, IOAT_TEST_SIZE, 0); + async_tx_ack(tx); + addr 
= dma_map_single(dma_chan->device->dev, src, IOAT_TEST_SIZE, + DMA_TO_DEVICE); + ioat_set_src(addr, tx, 0); + addr = dma_map_single(dma_chan->device->dev, dest, IOAT_TEST_SIZE, + DMA_FROM_DEVICE); + ioat_set_dest(addr, tx, 0); + cookie = ioat_tx_submit(tx); ioat_dma_memcpy_issue_pending(dma_chan); msleep(1); @@ -748,19 +723,20 @@ static int __devinit ioat_probe(struct pci_dev *pdev, device->reg_base = reg_base; - ioatdma_write8(device, IOAT_INTRCTRL_OFFSET, IOAT_INTRCTRL_MASTER_INT_EN); + writeb(IOAT_INTRCTRL_MASTER_INT_EN, device->reg_base + IOAT_INTRCTRL_OFFSET); pci_set_master(pdev); INIT_LIST_HEAD(&device->common.channels); enumerate_dma_channels(device); + dma_cap_set(DMA_MEMCPY, device->common.cap_mask); device->common.device_alloc_chan_resources = ioat_dma_alloc_chan_resources; device->common.device_free_chan_resources = ioat_dma_free_chan_resources; - device->common.device_memcpy_buf_to_buf = ioat_dma_memcpy_buf_to_buf; - device->common.device_memcpy_buf_to_pg = ioat_dma_memcpy_buf_to_pg; - device->common.device_memcpy_pg_to_pg = ioat_dma_memcpy_pg_to_pg; - device->common.device_memcpy_complete = ioat_dma_is_complete; - device->common.device_memcpy_issue_pending = ioat_dma_memcpy_issue_pending; + device->common.device_prep_dma_memcpy = ioat_dma_prep_memcpy; + device->common.device_is_tx_complete = ioat_dma_is_complete; + device->common.device_issue_pending = ioat_dma_memcpy_issue_pending; + device->common.device_dependency_added = ioat_dma_dependency_added; + device->common.dev = &pdev->dev; printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n", device->common.chancnt); @@ -787,9 +763,20 @@ err_request_regions: err_set_dma_mask: pci_disable_device(pdev); err_enable_device: + + printk(KERN_ERR "Intel(R) I/OAT DMA Engine initialization failed\n"); + return err; } +static void ioat_shutdown(struct pci_dev *pdev) +{ + struct ioat_device *device; + device = pci_get_drvdata(pdev); + + dma_async_device_unregister(&device->common); +} + static void 
__devexit ioat_remove(struct pci_dev *pdev) { struct ioat_device *device; @@ -818,7 +805,7 @@ static void __devexit ioat_remove(struct pci_dev *pdev) } /* MODULE API */ -MODULE_VERSION("1.7"); +MODULE_VERSION("1.9"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dma/ioatdma.h b/drivers/dma/ioatdma.h index 62b26a9be4c9..d3726478031a 100644 --- a/drivers/dma/ioatdma.h +++ b/drivers/dma/ioatdma.h @@ -30,9 +30,6 @@ #define IOAT_LOW_COMPLETION_MASK 0xffffffc0 -extern struct list_head dma_device_list; -extern struct list_head dma_client_list; - /** * struct ioat_device - internal representation of a IOAT device * @pdev: PCI-Express device @@ -105,21 +102,20 @@ struct ioat_dma_chan { /** * struct ioat_desc_sw - wrapper around hardware descriptor * @hw: hardware DMA descriptor - * @node: - * @cookie: - * @phys: + * @node: this descriptor will either be on the free list, + * or attached to a transaction list (async_tx.tx_list) + * @tx_cnt: number of descriptors required to complete the transaction + * @async_tx: the generic software descriptor for all engines */ - struct ioat_desc_sw { struct ioat_dma_descriptor *hw; struct list_head node; - dma_cookie_t cookie; - dma_addr_t phys; + int tx_cnt; DECLARE_PCI_UNMAP_ADDR(src) DECLARE_PCI_UNMAP_LEN(src_len) DECLARE_PCI_UNMAP_ADDR(dst) DECLARE_PCI_UNMAP_LEN(dst_len) + struct dma_async_tx_descriptor async_tx; }; #endif /* IOATDMA_H */ - diff --git a/drivers/dma/ioatdma_io.h b/drivers/dma/ioatdma_io.h deleted file mode 100644 index c0b4bf66c920..000000000000 --- a/drivers/dma/ioatdma_io.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. 
- * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ -#ifndef IOATDMA_IO_H -#define IOATDMA_IO_H - -#include - -/* - * device and per-channel MMIO register read and write functions - * this is a lot of anoying inline functions, but it's typesafe - */ - -static inline u8 ioatdma_read8(struct ioat_device *device, - unsigned int offset) -{ - return readb(device->reg_base + offset); -} - -static inline u16 ioatdma_read16(struct ioat_device *device, - unsigned int offset) -{ - return readw(device->reg_base + offset); -} - -static inline u32 ioatdma_read32(struct ioat_device *device, - unsigned int offset) -{ - return readl(device->reg_base + offset); -} - -static inline void ioatdma_write8(struct ioat_device *device, - unsigned int offset, u8 value) -{ - writeb(value, device->reg_base + offset); -} - -static inline void ioatdma_write16(struct ioat_device *device, - unsigned int offset, u16 value) -{ - writew(value, device->reg_base + offset); -} - -static inline void ioatdma_write32(struct ioat_device *device, - unsigned int offset, u32 value) -{ - writel(value, device->reg_base + offset); -} - -static inline u8 ioatdma_chan_read8(struct ioat_dma_chan *chan, - unsigned int offset) -{ - return readb(chan->reg_base + offset); -} - -static inline u16 ioatdma_chan_read16(struct ioat_dma_chan *chan, - unsigned int offset) -{ - return readw(chan->reg_base + offset); -} - -static inline u32 ioatdma_chan_read32(struct ioat_dma_chan *chan, - unsigned int 
offset) -{ - return readl(chan->reg_base + offset); -} - -static inline void ioatdma_chan_write8(struct ioat_dma_chan *chan, - unsigned int offset, u8 value) -{ - writeb(value, chan->reg_base + offset); -} - -static inline void ioatdma_chan_write16(struct ioat_dma_chan *chan, - unsigned int offset, u16 value) -{ - writew(value, chan->reg_base + offset); -} - -static inline void ioatdma_chan_write32(struct ioat_dma_chan *chan, - unsigned int offset, u32 value) -{ - writel(value, chan->reg_base + offset); -} - -#if (BITS_PER_LONG == 64) -static inline u64 ioatdma_chan_read64(struct ioat_dma_chan *chan, - unsigned int offset) -{ - return readq(chan->reg_base + offset); -} - -static inline void ioatdma_chan_write64(struct ioat_dma_chan *chan, - unsigned int offset, u64 value) -{ - writeq(value, chan->reg_base + offset); -} -#endif - -#endif /* IOATDMA_IO_H */ - diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c new file mode 100644 index 000000000000..5a1d426744d6 --- /dev/null +++ b/drivers/dma/iop-adma.c @@ -0,0 +1,1467 @@ +/* + * offload engine driver for the Intel Xscale series of i/o processors + * Copyright © 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +/* + * This driver supports the asynchronous DMA copy and RAID engines available + * on the Intel Xscale(R) family of I/O Processors (IOP 32x, 33x, 134x) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common) +#define to_iop_adma_device(dev) \ + container_of(dev, struct iop_adma_device, common) +#define tx_to_iop_adma_slot(tx) \ + container_of(tx, struct iop_adma_desc_slot, async_tx) + +/** + * iop_adma_free_slots - flags descriptor slots for reuse + * @slot: Slot to free + * Caller must hold &iop_chan->lock while calling this function + */ +static void iop_adma_free_slots(struct iop_adma_desc_slot *slot) +{ + int stride = slot->slots_per_op; + + while (stride--) { + slot->slots_per_op = 0; + slot = list_entry(slot->slot_node.next, + struct iop_adma_desc_slot, + slot_node); + } +} + +static dma_cookie_t +iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *iop_chan, dma_cookie_t cookie) +{ + BUG_ON(desc->async_tx.cookie < 0); + spin_lock_bh(&desc->async_tx.lock); + if (desc->async_tx.cookie > 0) { + cookie = desc->async_tx.cookie; + desc->async_tx.cookie = 0; + + /* call the callback (must not sleep or submit new + * operations to this channel) + */ + if (desc->async_tx.callback) + desc->async_tx.callback( + desc->async_tx.callback_param); + + /* unmap dma addresses + * (unmap_single vs unmap_page?)
+ */ + if (desc->group_head && desc->unmap_len) { + struct iop_adma_desc_slot *unmap = desc->group_head; + struct device *dev = + &iop_chan->device->pdev->dev; + u32 len = unmap->unmap_len; + u32 src_cnt = unmap->unmap_src_cnt; + dma_addr_t addr = iop_desc_get_dest_addr(unmap, + iop_chan); + + dma_unmap_page(dev, addr, len, DMA_FROM_DEVICE); + while (src_cnt--) { + addr = iop_desc_get_src_addr(unmap, + iop_chan, + src_cnt); + dma_unmap_page(dev, addr, len, + DMA_TO_DEVICE); + } + desc->group_head = NULL; + } + } + + /* run dependent operations */ + async_tx_run_dependencies(&desc->async_tx); + spin_unlock_bh(&desc->async_tx.lock); + + return cookie; +} + +static int +iop_adma_clean_slot(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *iop_chan) +{ + /* the client is allowed to attach dependent operations + * until 'ack' is set + */ + if (!desc->async_tx.ack) + return 0; + + /* leave the last descriptor in the chain + * so we can append to it + */ + if (desc->chain_node.next == &iop_chan->chain) + return 1; + + dev_dbg(iop_chan->device->common.dev, + "\tfree slot: %d slots_per_op: %d\n", + desc->idx, desc->slots_per_op); + + list_del(&desc->chain_node); + iop_adma_free_slots(desc); + + return 0; +} + +static void __iop_adma_slot_cleanup(struct iop_adma_chan *iop_chan) +{ + struct iop_adma_desc_slot *iter, *_iter, *grp_start = NULL; + dma_cookie_t cookie = 0; + u32 current_desc = iop_chan_get_current_descriptor(iop_chan); + int busy = iop_chan_is_busy(iop_chan); + int seen_current = 0, slot_cnt = 0, slots_per_op = 0; + + dev_dbg(iop_chan->device->common.dev, "%s\n", __FUNCTION__); + /* free completed slots from the chain starting with + * the oldest descriptor + */ + list_for_each_entry_safe(iter, _iter, &iop_chan->chain, + chain_node) { + pr_debug("\tcookie: %d slot: %d busy: %d " + "this_desc: %#x next_desc: %#x ack: %d\n", + iter->async_tx.cookie, iter->idx, busy, + iter->async_tx.phys, iop_desc_get_next_desc(iter), + iter->async_tx.ack); + 
prefetch(_iter); + prefetch(&_iter->async_tx); + + /* do not advance past the current descriptor loaded into the + * hardware channel, subsequent descriptors are either in + * process or have not been submitted + */ + if (seen_current) + break; + + /* stop the search if we reach the current descriptor and the + * channel is busy, or if it appears that the current descriptor + * needs to be re-read (i.e. has been appended to) + */ + if (iter->async_tx.phys == current_desc) { + BUG_ON(seen_current++); + if (busy || iop_desc_get_next_desc(iter)) + break; + } + + /* detect the start of a group transaction */ + if (!slot_cnt && !slots_per_op) { + slot_cnt = iter->slot_cnt; + slots_per_op = iter->slots_per_op; + if (slot_cnt <= slots_per_op) { + slot_cnt = 0; + slots_per_op = 0; + } + } + + if (slot_cnt) { + pr_debug("\tgroup++\n"); + if (!grp_start) + grp_start = iter; + slot_cnt -= slots_per_op; + } + + /* all the members of a group are complete */ + if (slots_per_op != 0 && slot_cnt == 0) { + struct iop_adma_desc_slot *grp_iter, *_grp_iter; + int end_of_chain = 0; + pr_debug("\tgroup end\n"); + + /* collect the total results */ + if (grp_start->xor_check_result) { + u32 zero_sum_result = 0; + slot_cnt = grp_start->slot_cnt; + grp_iter = grp_start; + + list_for_each_entry_from(grp_iter, + &iop_chan->chain, chain_node) { + zero_sum_result |= + iop_desc_get_zero_result(grp_iter); + pr_debug("\titer%d result: %d\n", + grp_iter->idx, zero_sum_result); + slot_cnt -= slots_per_op; + if (slot_cnt == 0) + break; + } + pr_debug("\tgrp_start->xor_check_result: %p\n", + grp_start->xor_check_result); + *grp_start->xor_check_result = zero_sum_result; + } + + /* clean up the group */ + slot_cnt = grp_start->slot_cnt; + grp_iter = grp_start; + list_for_each_entry_safe_from(grp_iter, _grp_iter, + &iop_chan->chain, chain_node) { + cookie = iop_adma_run_tx_complete_actions( + grp_iter, iop_chan, cookie); + + slot_cnt -= slots_per_op; + end_of_chain = iop_adma_clean_slot(grp_iter, + 
iop_chan); + + if (slot_cnt == 0 || end_of_chain) + break; + } + + /* the group should be complete at this point */ + BUG_ON(slot_cnt); + + slots_per_op = 0; + grp_start = NULL; + if (end_of_chain) + break; + else + continue; + } else if (slots_per_op) /* wait for group completion */ + continue; + + /* write back zero sum results (single descriptor case) */ + if (iter->xor_check_result && iter->async_tx.cookie) + *iter->xor_check_result = + iop_desc_get_zero_result(iter); + + cookie = iop_adma_run_tx_complete_actions( + iter, iop_chan, cookie); + + if (iop_adma_clean_slot(iter, iop_chan)) + break; + } + + BUG_ON(!seen_current); + + iop_chan_idle(busy, iop_chan); + + if (cookie > 0) { + iop_chan->completed_cookie = cookie; + pr_debug("\tcompleted cookie %d\n", cookie); + } +} + +static void +iop_adma_slot_cleanup(struct iop_adma_chan *iop_chan) +{ + spin_lock_bh(&iop_chan->lock); + __iop_adma_slot_cleanup(iop_chan); + spin_unlock_bh(&iop_chan->lock); +} + +static void iop_adma_tasklet(unsigned long data) +{ + struct iop_adma_chan *chan = (struct iop_adma_chan *) data; + __iop_adma_slot_cleanup(chan); +} + +static struct iop_adma_desc_slot * +iop_adma_alloc_slots(struct iop_adma_chan *iop_chan, int num_slots, + int slots_per_op) +{ + struct iop_adma_desc_slot *iter, *_iter, *alloc_start = NULL; + struct list_head chain = LIST_HEAD_INIT(chain); + int slots_found, retry = 0; + + /* start search from the last allocated descriptor + * if a contiguous allocation can not be found start searching + * from the beginning of the list + */ +retry: + slots_found = 0; + if (retry == 0) + iter = iop_chan->last_used; + else + iter = list_entry(&iop_chan->all_slots, + struct iop_adma_desc_slot, + slot_node); + + list_for_each_entry_safe_continue( + iter, _iter, &iop_chan->all_slots, slot_node) { + prefetch(_iter); + prefetch(&_iter->async_tx); + if (iter->slots_per_op) { + /* give up after finding the first busy slot + * on the second pass through the list + */ + if (retry) + 
break; + + slots_found = 0; + continue; + } + + /* start the allocation if the slot is correctly aligned */ + if (!slots_found++) { + if (iop_desc_is_aligned(iter, slots_per_op)) + alloc_start = iter; + else { + slots_found = 0; + continue; + } + } + + if (slots_found == num_slots) { + struct iop_adma_desc_slot *alloc_tail = NULL; + struct iop_adma_desc_slot *last_used = NULL; + iter = alloc_start; + while (num_slots) { + int i; + dev_dbg(iop_chan->device->common.dev, + "allocated slot: %d " + "(desc %p phys: %#x) slots_per_op %d\n", + iter->idx, iter->hw_desc, + iter->async_tx.phys, slots_per_op); + + /* pre-ack all but the last descriptor */ + if (num_slots != slots_per_op) + iter->async_tx.ack = 1; + else + iter->async_tx.ack = 0; + + list_add_tail(&iter->chain_node, &chain); + alloc_tail = iter; + iter->async_tx.cookie = 0; + iter->slot_cnt = num_slots; + iter->xor_check_result = NULL; + for (i = 0; i < slots_per_op; i++) { + iter->slots_per_op = slots_per_op - i; + last_used = iter; + iter = list_entry(iter->slot_node.next, + struct iop_adma_desc_slot, + slot_node); + } + num_slots -= slots_per_op; + } + alloc_tail->group_head = alloc_start; + alloc_tail->async_tx.cookie = -EBUSY; + list_splice(&chain, &alloc_tail->async_tx.tx_list); + iop_chan->last_used = last_used; + iop_desc_clear_next_desc(alloc_start); + iop_desc_clear_next_desc(alloc_tail); + return alloc_tail; + } + } + if (!retry++) + goto retry; + + /* try to free some slots if the allocation fails */ + tasklet_schedule(&iop_chan->irq_tasklet); + + return NULL; +} + +static dma_cookie_t +iop_desc_assign_cookie(struct iop_adma_chan *iop_chan, + struct iop_adma_desc_slot *desc) +{ + dma_cookie_t cookie = iop_chan->common.cookie; + cookie++; + if (cookie < 0) + cookie = 1; + iop_chan->common.cookie = desc->async_tx.cookie = cookie; + return cookie; +} + +static void iop_adma_check_threshold(struct iop_adma_chan *iop_chan) +{ + dev_dbg(iop_chan->device->common.dev, "pending: %d\n", + iop_chan->pending); 
+ + if (iop_chan->pending >= IOP_ADMA_THRESHOLD) { + iop_chan->pending = 0; + iop_chan_append(iop_chan); + } +} + +static dma_cookie_t +iop_adma_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx); + struct iop_adma_chan *iop_chan = to_iop_adma_chan(tx->chan); + struct iop_adma_desc_slot *grp_start, *old_chain_tail; + int slot_cnt; + int slots_per_op; + dma_cookie_t cookie; + + grp_start = sw_desc->group_head; + slot_cnt = grp_start->slot_cnt; + slots_per_op = grp_start->slots_per_op; + + spin_lock_bh(&iop_chan->lock); + cookie = iop_desc_assign_cookie(iop_chan, sw_desc); + + old_chain_tail = list_entry(iop_chan->chain.prev, + struct iop_adma_desc_slot, chain_node); + list_splice_init(&sw_desc->async_tx.tx_list, + &old_chain_tail->chain_node); + + /* fix up the hardware chain */ + iop_desc_set_next_desc(old_chain_tail, grp_start->async_tx.phys); + + /* 1/ don't add pre-chained descriptors + * 2/ dummy read to flush next_desc write + */ + BUG_ON(iop_desc_get_next_desc(sw_desc)); + + /* increment the pending count by the number of slots + * memcpy operations have a 1:1 (slot:operation) relation + * other operations are heavier and will pop the threshold + * more often. 
+ */ + iop_chan->pending += slot_cnt; + iop_adma_check_threshold(iop_chan); + spin_unlock_bh(&iop_chan->lock); + + dev_dbg(iop_chan->device->common.dev, "%s cookie: %d slot: %d\n", + __FUNCTION__, sw_desc->async_tx.cookie, sw_desc->idx); + + return cookie; +} + +static void +iop_adma_set_dest(dma_addr_t addr, struct dma_async_tx_descriptor *tx, + int index) +{ + struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx); + struct iop_adma_chan *iop_chan = to_iop_adma_chan(tx->chan); + + /* to do: support transfers lengths > IOP_ADMA_MAX_BYTE_COUNT */ + iop_desc_set_dest_addr(sw_desc->group_head, iop_chan, addr); +} + +static void iop_chan_start_null_memcpy(struct iop_adma_chan *iop_chan); +static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan); + +/* returns the number of allocated descriptors */ +static int iop_adma_alloc_chan_resources(struct dma_chan *chan) +{ + char *hw_desc; + int idx; + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *slot = NULL; + int init = iop_chan->slots_allocated ? 
0 : 1; + struct iop_adma_platform_data *plat_data = + iop_chan->device->pdev->dev.platform_data; + int num_descs_in_pool = plat_data->pool_size/IOP_ADMA_SLOT_SIZE; + + /* Allocate descriptor slots */ + do { + idx = iop_chan->slots_allocated; + if (idx == num_descs_in_pool) + break; + + slot = kzalloc(sizeof(*slot), GFP_KERNEL); + if (!slot) { + printk(KERN_INFO "IOP ADMA Channel only initialized" + " %d descriptor slots", idx); + break; + } + hw_desc = (char *) iop_chan->device->dma_desc_pool_virt; + slot->hw_desc = (void *) &hw_desc[idx * IOP_ADMA_SLOT_SIZE]; + + dma_async_tx_descriptor_init(&slot->async_tx, chan); + slot->async_tx.tx_submit = iop_adma_tx_submit; + slot->async_tx.tx_set_dest = iop_adma_set_dest; + INIT_LIST_HEAD(&slot->chain_node); + INIT_LIST_HEAD(&slot->slot_node); + INIT_LIST_HEAD(&slot->async_tx.tx_list); + hw_desc = (char *) iop_chan->device->dma_desc_pool; + slot->async_tx.phys = + (dma_addr_t) &hw_desc[idx * IOP_ADMA_SLOT_SIZE]; + slot->idx = idx; + + spin_lock_bh(&iop_chan->lock); + iop_chan->slots_allocated++; + list_add_tail(&slot->slot_node, &iop_chan->all_slots); + spin_unlock_bh(&iop_chan->lock); + } while (iop_chan->slots_allocated < num_descs_in_pool); + + if (idx && !iop_chan->last_used) + iop_chan->last_used = list_entry(iop_chan->all_slots.next, + struct iop_adma_desc_slot, + slot_node); + + dev_dbg(iop_chan->device->common.dev, + "allocated %d descriptor slots last_used: %p\n", + iop_chan->slots_allocated, iop_chan->last_used); + + /* initialize the channel and the chain with a null operation */ + if (init) { + if (dma_has_cap(DMA_MEMCPY, + iop_chan->device->common.cap_mask)) + iop_chan_start_null_memcpy(iop_chan); + else if (dma_has_cap(DMA_XOR, + iop_chan->device->common.cap_mask)) + iop_chan_start_null_xor(iop_chan); + else + BUG(); + } + + return (idx > 0) ? 
idx : -ENOMEM; +} + +static struct dma_async_tx_descriptor * +iop_adma_prep_dma_interrupt(struct dma_chan *chan) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *sw_desc, *grp_start; + int slot_cnt, slots_per_op; + + dev_dbg(iop_chan->device->common.dev, "%s\n", __FUNCTION__); + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_interrupt_slot_count(&slots_per_op, iop_chan); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + grp_start = sw_desc->group_head; + iop_desc_init_interrupt(grp_start, iop_chan); + grp_start->unmap_len = 0; + } + spin_unlock_bh(&iop_chan->lock); + + return sw_desc ? &sw_desc->async_tx : NULL; +} + +static void +iop_adma_memcpy_set_src(dma_addr_t addr, struct dma_async_tx_descriptor *tx, + int index) +{ + struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx); + struct iop_adma_desc_slot *grp_start = sw_desc->group_head; + + iop_desc_set_memcpy_src_addr(grp_start, addr); +} + +static struct dma_async_tx_descriptor * +iop_adma_prep_dma_memcpy(struct dma_chan *chan, size_t len, int int_en) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *sw_desc, *grp_start; + int slot_cnt, slots_per_op; + + if (unlikely(!len)) + return NULL; + BUG_ON(unlikely(len > IOP_ADMA_MAX_BYTE_COUNT)); + + dev_dbg(iop_chan->device->common.dev, "%s len: %u\n", + __FUNCTION__, len); + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_memcpy_slot_count(len, &slots_per_op); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + grp_start = sw_desc->group_head; + iop_desc_init_memcpy(grp_start, int_en); + iop_desc_set_byte_count(grp_start, iop_chan, len); + sw_desc->unmap_src_cnt = 1; + sw_desc->unmap_len = len; + sw_desc->async_tx.tx_set_src = iop_adma_memcpy_set_src; + } + spin_unlock_bh(&iop_chan->lock); + + return sw_desc ? 
&sw_desc->async_tx : NULL; +} + +static struct dma_async_tx_descriptor * +iop_adma_prep_dma_memset(struct dma_chan *chan, int value, size_t len, + int int_en) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *sw_desc, *grp_start; + int slot_cnt, slots_per_op; + + if (unlikely(!len)) + return NULL; + BUG_ON(unlikely(len > IOP_ADMA_MAX_BYTE_COUNT)); + + dev_dbg(iop_chan->device->common.dev, "%s len: %u\n", + __FUNCTION__, len); + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_memset_slot_count(len, &slots_per_op); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + grp_start = sw_desc->group_head; + iop_desc_init_memset(grp_start, int_en); + iop_desc_set_byte_count(grp_start, iop_chan, len); + iop_desc_set_block_fill_val(grp_start, value); + sw_desc->unmap_src_cnt = 1; + sw_desc->unmap_len = len; + } + spin_unlock_bh(&iop_chan->lock); + + return sw_desc ? &sw_desc->async_tx : NULL; +} + +static void +iop_adma_xor_set_src(dma_addr_t addr, struct dma_async_tx_descriptor *tx, + int index) +{ + struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx); + struct iop_adma_desc_slot *grp_start = sw_desc->group_head; + + iop_desc_set_xor_src_addr(grp_start, index, addr); +} + +static struct dma_async_tx_descriptor * +iop_adma_prep_dma_xor(struct dma_chan *chan, unsigned int src_cnt, size_t len, + int int_en) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *sw_desc, *grp_start; + int slot_cnt, slots_per_op; + + if (unlikely(!len)) + return NULL; + BUG_ON(unlikely(len > IOP_ADMA_XOR_MAX_BYTE_COUNT)); + + dev_dbg(iop_chan->device->common.dev, + "%s src_cnt: %d len: %u int_en: %d\n", + __FUNCTION__, src_cnt, len, int_en); + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_xor_slot_count(len, src_cnt, &slots_per_op); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + grp_start = sw_desc->group_head; + 
iop_desc_init_xor(grp_start, src_cnt, int_en); + iop_desc_set_byte_count(grp_start, iop_chan, len); + sw_desc->unmap_src_cnt = src_cnt; + sw_desc->unmap_len = len; + sw_desc->async_tx.tx_set_src = iop_adma_xor_set_src; + } + spin_unlock_bh(&iop_chan->lock); + + return sw_desc ? &sw_desc->async_tx : NULL; +} + +static void +iop_adma_xor_zero_sum_set_src(dma_addr_t addr, + struct dma_async_tx_descriptor *tx, + int index) +{ + struct iop_adma_desc_slot *sw_desc = tx_to_iop_adma_slot(tx); + struct iop_adma_desc_slot *grp_start = sw_desc->group_head; + + iop_desc_set_zero_sum_src_addr(grp_start, index, addr); +} + +static struct dma_async_tx_descriptor * +iop_adma_prep_dma_zero_sum(struct dma_chan *chan, unsigned int src_cnt, + size_t len, u32 *result, int int_en) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *sw_desc, *grp_start; + int slot_cnt, slots_per_op; + + if (unlikely(!len)) + return NULL; + + dev_dbg(iop_chan->device->common.dev, "%s src_cnt: %d len: %u\n", + __FUNCTION__, src_cnt, len); + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_zero_sum_slot_count(len, src_cnt, &slots_per_op); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + grp_start = sw_desc->group_head; + iop_desc_init_zero_sum(grp_start, src_cnt, int_en); + iop_desc_set_zero_sum_byte_count(grp_start, len); + grp_start->xor_check_result = result; + pr_debug("\t%s: grp_start->xor_check_result: %p\n", + __FUNCTION__, grp_start->xor_check_result); + sw_desc->unmap_src_cnt = src_cnt; + sw_desc->unmap_len = len; + sw_desc->async_tx.tx_set_src = iop_adma_xor_zero_sum_set_src; + } + spin_unlock_bh(&iop_chan->lock); + + return sw_desc ? 
&sw_desc->async_tx : NULL; +} + +static void iop_adma_dependency_added(struct dma_chan *chan) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + tasklet_schedule(&iop_chan->irq_tasklet); +} + +static void iop_adma_free_chan_resources(struct dma_chan *chan) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + struct iop_adma_desc_slot *iter, *_iter; + int in_use_descs = 0; + + iop_adma_slot_cleanup(iop_chan); + + spin_lock_bh(&iop_chan->lock); + list_for_each_entry_safe(iter, _iter, &iop_chan->chain, + chain_node) { + in_use_descs++; + list_del(&iter->chain_node); + } + list_for_each_entry_safe_reverse( + iter, _iter, &iop_chan->all_slots, slot_node) { + list_del(&iter->slot_node); + kfree(iter); + iop_chan->slots_allocated--; + } + iop_chan->last_used = NULL; + + dev_dbg(iop_chan->device->common.dev, "%s slots_allocated %d\n", + __FUNCTION__, iop_chan->slots_allocated); + spin_unlock_bh(&iop_chan->lock); + + /* one is ok since we left it on there on purpose */ + if (in_use_descs > 1) + printk(KERN_ERR "IOP: Freeing %d in use descriptors!\n", + in_use_descs - 1); +} + +/** + * iop_adma_is_complete - poll the status of an ADMA transaction + * @chan: ADMA channel handle + * @cookie: ADMA transaction identifier + */ +static enum dma_status iop_adma_is_complete(struct dma_chan *chan, + dma_cookie_t cookie, + dma_cookie_t *done, + dma_cookie_t *used) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + dma_cookie_t last_used; + dma_cookie_t last_complete; + enum dma_status ret; + + last_used = chan->cookie; + last_complete = iop_chan->completed_cookie; + + if (done) + *done = last_complete; + if (used) + *used = last_used; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + if (ret == DMA_SUCCESS) + return ret; + + iop_adma_slot_cleanup(iop_chan); + + last_used = chan->cookie; + last_complete = iop_chan->completed_cookie; + + if (done) + *done = last_complete; + if (used) + *used = last_used; + + return 
dma_async_is_complete(cookie, last_complete, last_used); +} + +static irqreturn_t iop_adma_eot_handler(int irq, void *data) +{ + struct iop_adma_chan *chan = data; + + dev_dbg(chan->device->common.dev, "%s\n", __FUNCTION__); + + tasklet_schedule(&chan->irq_tasklet); + + iop_adma_device_clear_eot_status(chan); + + return IRQ_HANDLED; +} + +static irqreturn_t iop_adma_eoc_handler(int irq, void *data) +{ + struct iop_adma_chan *chan = data; + + dev_dbg(chan->device->common.dev, "%s\n", __FUNCTION__); + + tasklet_schedule(&chan->irq_tasklet); + + iop_adma_device_clear_eoc_status(chan); + + return IRQ_HANDLED; +} + +static irqreturn_t iop_adma_err_handler(int irq, void *data) +{ + struct iop_adma_chan *chan = data; + unsigned long status = iop_chan_get_status(chan); + + dev_printk(KERN_ERR, chan->device->common.dev, + "error ( %s%s%s%s%s%s%s)\n", + iop_is_err_int_parity(status, chan) ? "int_parity " : "", + iop_is_err_mcu_abort(status, chan) ? "mcu_abort " : "", + iop_is_err_int_tabort(status, chan) ? "int_tabort " : "", + iop_is_err_int_mabort(status, chan) ? "int_mabort " : "", + iop_is_err_pci_tabort(status, chan) ? "pci_tabort " : "", + iop_is_err_pci_mabort(status, chan) ? "pci_mabort " : "", + iop_is_err_split_tx(status, chan) ? "split_tx " : ""); + + iop_adma_device_clear_err_status(chan); + + BUG(); + + return IRQ_HANDLED; +} + +static void iop_adma_issue_pending(struct dma_chan *chan) +{ + struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan); + + if (iop_chan->pending) { + iop_chan->pending = 0; + iop_chan_append(iop_chan); + } +} + +/* + * Perform a transaction to verify the HW works. 
+ */ +#define IOP_ADMA_TEST_SIZE 2000 + +static int __devinit iop_adma_memcpy_self_test(struct iop_adma_device *device) +{ + int i; + void *src, *dest; + dma_addr_t src_dma, dest_dma; + struct dma_chan *dma_chan; + dma_cookie_t cookie; + struct dma_async_tx_descriptor *tx; + int err = 0; + struct iop_adma_chan *iop_chan; + + dev_dbg(device->common.dev, "%s\n", __FUNCTION__); + + src = kzalloc(sizeof(u8) * IOP_ADMA_TEST_SIZE, GFP_KERNEL); + if (!src) + return -ENOMEM; + dest = kzalloc(sizeof(u8) * IOP_ADMA_TEST_SIZE, GFP_KERNEL); + if (!dest) { + kfree(src); + return -ENOMEM; + } + + /* Fill in src buffer */ + for (i = 0; i < IOP_ADMA_TEST_SIZE; i++) + ((u8 *) src)[i] = (u8)i; + + memset(dest, 0, IOP_ADMA_TEST_SIZE); + + /* Start copy, using first DMA channel */ + dma_chan = container_of(device->common.channels.next, + struct dma_chan, + device_node); + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { + err = -ENODEV; + goto out; + } + + tx = iop_adma_prep_dma_memcpy(dma_chan, IOP_ADMA_TEST_SIZE, 1); + dest_dma = dma_map_single(dma_chan->device->dev, dest, + IOP_ADMA_TEST_SIZE, DMA_FROM_DEVICE); + iop_adma_set_dest(dest_dma, tx, 0); + src_dma = dma_map_single(dma_chan->device->dev, src, + IOP_ADMA_TEST_SIZE, DMA_TO_DEVICE); + iop_adma_memcpy_set_src(src_dma, tx, 0); + + cookie = iop_adma_tx_submit(tx); + iop_adma_issue_pending(dma_chan); + async_tx_ack(tx); + msleep(1); + + if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != + DMA_SUCCESS) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test copy timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + iop_chan = to_iop_adma_chan(dma_chan); + dma_sync_single_for_cpu(&iop_chan->device->pdev->dev, dest_dma, + IOP_ADMA_TEST_SIZE, DMA_FROM_DEVICE); + if (memcmp(src, dest, IOP_ADMA_TEST_SIZE)) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test copy failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + +free_resources: + 
iop_adma_free_chan_resources(dma_chan); +out: + kfree(src); + kfree(dest); + return err; +} + +#define IOP_ADMA_NUM_SRC_TEST 4 /* must be <= 15 */ +static int __devinit +iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) +{ + int i, src_idx; + struct page *dest; + struct page *xor_srcs[IOP_ADMA_NUM_SRC_TEST]; + struct page *zero_sum_srcs[IOP_ADMA_NUM_SRC_TEST + 1]; + dma_addr_t dma_addr, dest_dma; + struct dma_async_tx_descriptor *tx; + struct dma_chan *dma_chan; + dma_cookie_t cookie; + u8 cmp_byte = 0; + u32 cmp_word; + u32 zero_sum_result; + int err = 0; + struct iop_adma_chan *iop_chan; + + dev_dbg(device->common.dev, "%s\n", __FUNCTION__); + + for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) { + xor_srcs[src_idx] = alloc_page(GFP_KERNEL); + if (!xor_srcs[src_idx]) + while (src_idx--) { + __free_page(xor_srcs[src_idx]); + return -ENOMEM; + } + } + + dest = alloc_page(GFP_KERNEL); + if (!dest) + while (src_idx--) { + __free_page(xor_srcs[src_idx]); + return -ENOMEM; + } + + /* Fill in src buffers */ + for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) { + u8 *ptr = page_address(xor_srcs[src_idx]); + for (i = 0; i < PAGE_SIZE; i++) + ptr[i] = (1 << src_idx); + } + + for (src_idx = 0; src_idx < IOP_ADMA_NUM_SRC_TEST; src_idx++) + cmp_byte ^= (u8) (1 << src_idx); + + cmp_word = (cmp_byte << 24) | (cmp_byte << 16) | + (cmp_byte << 8) | cmp_byte; + + memset(page_address(dest), 0, PAGE_SIZE); + + dma_chan = container_of(device->common.channels.next, + struct dma_chan, + device_node); + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { + err = -ENODEV; + goto out; + } + + /* test xor */ + tx = iop_adma_prep_dma_xor(dma_chan, IOP_ADMA_NUM_SRC_TEST, + PAGE_SIZE, 1); + dest_dma = dma_map_page(dma_chan->device->dev, dest, 0, + PAGE_SIZE, DMA_FROM_DEVICE); + iop_adma_set_dest(dest_dma, tx, 0); + + for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) { + dma_addr = dma_map_page(dma_chan->device->dev, xor_srcs[i], 0, + PAGE_SIZE, DMA_TO_DEVICE); + 
iop_adma_xor_set_src(dma_addr, tx, i); + } + + cookie = iop_adma_tx_submit(tx); + iop_adma_issue_pending(dma_chan); + async_tx_ack(tx); + msleep(8); + + if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != + DMA_SUCCESS) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test xor timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + iop_chan = to_iop_adma_chan(dma_chan); + dma_sync_single_for_cpu(&iop_chan->device->pdev->dev, dest_dma, + PAGE_SIZE, DMA_FROM_DEVICE); + for (i = 0; i < (PAGE_SIZE / sizeof(u32)); i++) { + u32 *ptr = page_address(dest); + if (ptr[i] != cmp_word) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test xor failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + } + dma_sync_single_for_device(&iop_chan->device->pdev->dev, dest_dma, + PAGE_SIZE, DMA_TO_DEVICE); + + /* skip zero sum if the capability is not present */ + if (!dma_has_cap(DMA_ZERO_SUM, dma_chan->device->cap_mask)) + goto free_resources; + + /* zero sum the sources with the destination page */ + for (i = 0; i < IOP_ADMA_NUM_SRC_TEST; i++) + zero_sum_srcs[i] = xor_srcs[i]; + zero_sum_srcs[i] = dest; + + zero_sum_result = 1; + + tx = iop_adma_prep_dma_zero_sum(dma_chan, IOP_ADMA_NUM_SRC_TEST + 1, + PAGE_SIZE, &zero_sum_result, 1); + for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 1; i++) { + dma_addr = dma_map_page(dma_chan->device->dev, zero_sum_srcs[i], + 0, PAGE_SIZE, DMA_TO_DEVICE); + iop_adma_xor_zero_sum_set_src(dma_addr, tx, i); + } + + cookie = iop_adma_tx_submit(tx); + iop_adma_issue_pending(dma_chan); + async_tx_ack(tx); + msleep(8); + + if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test zero sum timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + if (zero_sum_result != 0) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test zero sum failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + 
/* test memset */ + tx = iop_adma_prep_dma_memset(dma_chan, 0, PAGE_SIZE, 1); + dma_addr = dma_map_page(dma_chan->device->dev, dest, 0, + PAGE_SIZE, DMA_FROM_DEVICE); + iop_adma_set_dest(dma_addr, tx, 0); + + cookie = iop_adma_tx_submit(tx); + iop_adma_issue_pending(dma_chan); + async_tx_ack(tx); + msleep(8); + + if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test memset timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) { + u32 *ptr = page_address(dest); + if (ptr[i]) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test memset failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + } + + /* test for non-zero parity sum */ + zero_sum_result = 0; + tx = iop_adma_prep_dma_zero_sum(dma_chan, IOP_ADMA_NUM_SRC_TEST + 1, + PAGE_SIZE, &zero_sum_result, 1); + for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 1; i++) { + dma_addr = dma_map_page(dma_chan->device->dev, zero_sum_srcs[i], + 0, PAGE_SIZE, DMA_TO_DEVICE); + iop_adma_xor_zero_sum_set_src(dma_addr, tx, i); + } + + cookie = iop_adma_tx_submit(tx); + iop_adma_issue_pending(dma_chan); + async_tx_ack(tx); + msleep(8); + + if (iop_adma_is_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test non-zero sum timed out, disabling\n"); + err = -ENODEV; + goto free_resources; + } + + if (zero_sum_result != 1) { + dev_printk(KERN_ERR, dma_chan->device->dev, + "Self-test non-zero sum failed compare, disabling\n"); + err = -ENODEV; + goto free_resources; + } + +free_resources: + iop_adma_free_chan_resources(dma_chan); +out: + src_idx = IOP_ADMA_NUM_SRC_TEST; + while (src_idx--) + __free_page(xor_srcs[src_idx]); + __free_page(dest); + return err; +} + +static int __devexit iop_adma_remove(struct platform_device *dev) +{ + struct iop_adma_device *device = platform_get_drvdata(dev); + struct dma_chan *chan, 
*_chan; + struct iop_adma_chan *iop_chan; + int i; + struct iop_adma_platform_data *plat_data = dev->dev.platform_data; + + dma_async_device_unregister(&device->common); + + for (i = 0; i < 3; i++) { + unsigned int irq; + irq = platform_get_irq(dev, i); + free_irq(irq, device); + } + + dma_free_coherent(&dev->dev, plat_data->pool_size, + device->dma_desc_pool_virt, device->dma_desc_pool); + + do { + struct resource *res; + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + release_mem_region(res->start, res->end - res->start); + } while (0); + + list_for_each_entry_safe(chan, _chan, &device->common.channels, + device_node) { + iop_chan = to_iop_adma_chan(chan); + list_del(&chan->device_node); + kfree(iop_chan); + } + kfree(device); + + return 0; +} + +static int __devinit iop_adma_probe(struct platform_device *pdev) +{ + struct resource *res; + int ret = 0, i; + struct iop_adma_device *adev; + struct iop_adma_chan *iop_chan; + struct dma_device *dma_dev; + struct iop_adma_platform_data *plat_data = pdev->dev.platform_data; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + + if (!devm_request_mem_region(&pdev->dev, res->start, + res->end - res->start, pdev->name)) + return -EBUSY; + + adev = kzalloc(sizeof(*adev), GFP_KERNEL); + if (!adev) + return -ENOMEM; + dma_dev = &adev->common; + + /* allocate coherent memory for hardware descriptors + * note: writecombine gives slightly better performance, but + * requires that we explicitly flush the writes + */ + if ((adev->dma_desc_pool_virt = dma_alloc_writecombine(&pdev->dev, + plat_data->pool_size, + &adev->dma_desc_pool, + GFP_KERNEL)) == NULL) { + ret = -ENOMEM; + goto err_free_adev; + } + + dev_dbg(&pdev->dev, "%s: allocted descriptor pool virt %p phys %p\n", + __FUNCTION__, adev->dma_desc_pool_virt, + (void *) adev->dma_desc_pool); + + adev->id = plat_data->hw_id; + + /* discover transaction capabilities from the platform data */ + dma_dev->cap_mask = plat_data->cap_mask; + 
+ adev->pdev = pdev; + platform_set_drvdata(pdev, adev); + + INIT_LIST_HEAD(&dma_dev->channels); + + /* set base routines */ + dma_dev->device_alloc_chan_resources = iop_adma_alloc_chan_resources; + dma_dev->device_free_chan_resources = iop_adma_free_chan_resources; + dma_dev->device_is_tx_complete = iop_adma_is_complete; + dma_dev->device_issue_pending = iop_adma_issue_pending; + dma_dev->device_dependency_added = iop_adma_dependency_added; + dma_dev->dev = &pdev->dev; + + /* set prep routines based on capability */ + if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) + dma_dev->device_prep_dma_memcpy = iop_adma_prep_dma_memcpy; + if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) + dma_dev->device_prep_dma_memset = iop_adma_prep_dma_memset; + if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { + dma_dev->max_xor = iop_adma_get_max_xor(); + dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor; + } + if (dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask)) + dma_dev->device_prep_dma_zero_sum = + iop_adma_prep_dma_zero_sum; + if (dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask)) + dma_dev->device_prep_dma_interrupt = + iop_adma_prep_dma_interrupt; + + iop_chan = kzalloc(sizeof(*iop_chan), GFP_KERNEL); + if (!iop_chan) { + ret = -ENOMEM; + goto err_free_dma; + } + iop_chan->device = adev; + + iop_chan->mmr_base = devm_ioremap(&pdev->dev, res->start, + res->end - res->start); + if (!iop_chan->mmr_base) { + ret = -ENOMEM; + goto err_free_iop_chan; + } + tasklet_init(&iop_chan->irq_tasklet, iop_adma_tasklet, (unsigned long) + iop_chan); + + /* clear errors before enabling interrupts */ + iop_adma_device_clear_err_status(iop_chan); + + for (i = 0; i < 3; i++) { + irq_handler_t handler[] = { iop_adma_eot_handler, + iop_adma_eoc_handler, + iop_adma_err_handler }; + int irq = platform_get_irq(pdev, i); + if (irq < 0) { + ret = -ENXIO; + goto err_free_iop_chan; + } else { + ret = devm_request_irq(&pdev->dev, irq, + handler[i], 0, pdev->name, iop_chan); + if (ret) + goto err_free_iop_chan; + } + 
} + + spin_lock_init(&iop_chan->lock); + init_timer(&iop_chan->cleanup_watchdog); + iop_chan->cleanup_watchdog.data = (unsigned long) iop_chan; + iop_chan->cleanup_watchdog.function = iop_adma_tasklet; + INIT_LIST_HEAD(&iop_chan->chain); + INIT_LIST_HEAD(&iop_chan->all_slots); + INIT_RCU_HEAD(&iop_chan->common.rcu); + iop_chan->common.device = dma_dev; + list_add_tail(&iop_chan->common.device_node, &dma_dev->channels); + + if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { + ret = iop_adma_memcpy_self_test(adev); + dev_dbg(&pdev->dev, "memcpy self test returned %d\n", ret); + if (ret) + goto err_free_iop_chan; + } + + if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) || + dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { + ret = iop_adma_xor_zero_sum_self_test(adev); + dev_dbg(&pdev->dev, "xor self test returned %d\n", ret); + if (ret) + goto err_free_iop_chan; + } + + dev_printk(KERN_INFO, &pdev->dev, "Intel(R) IOP: " + "( %s%s%s%s%s%s%s%s%s%s)\n", + dma_has_cap(DMA_PQ_XOR, dma_dev->cap_mask) ? "pq_xor " : "", + dma_has_cap(DMA_PQ_UPDATE, dma_dev->cap_mask) ? "pq_update " : "", + dma_has_cap(DMA_PQ_ZERO_SUM, dma_dev->cap_mask) ? "pq_zero_sum " : "", + dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", + dma_has_cap(DMA_DUAL_XOR, dma_dev->cap_mask) ? "dual_xor " : "", + dma_has_cap(DMA_ZERO_SUM, dma_dev->cap_mask) ? "xor_zero_sum " : "", + dma_has_cap(DMA_MEMSET, dma_dev->cap_mask) ? "fill " : "", + dma_has_cap(DMA_MEMCPY_CRC32C, dma_dev->cap_mask) ? "cpy+crc " : "", + dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "", + dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? 
"intr " : ""); + + dma_async_device_register(dma_dev); + goto out; + + err_free_iop_chan: + kfree(iop_chan); + err_free_dma: + dma_free_coherent(&adev->pdev->dev, plat_data->pool_size, + adev->dma_desc_pool_virt, adev->dma_desc_pool); + err_free_adev: + kfree(adev); + out: + return ret; +} + +static void iop_chan_start_null_memcpy(struct iop_adma_chan *iop_chan) +{ + struct iop_adma_desc_slot *sw_desc, *grp_start; + dma_cookie_t cookie; + int slot_cnt, slots_per_op; + + dev_dbg(iop_chan->device->common.dev, "%s\n", __FUNCTION__); + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_memcpy_slot_count(0, &slots_per_op); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + grp_start = sw_desc->group_head; + + list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain); + sw_desc->async_tx.ack = 1; + iop_desc_init_memcpy(grp_start, 0); + iop_desc_set_byte_count(grp_start, iop_chan, 0); + iop_desc_set_dest_addr(grp_start, iop_chan, 0); + iop_desc_set_memcpy_src_addr(grp_start, 0); + + cookie = iop_chan->common.cookie; + cookie++; + if (cookie <= 1) + cookie = 2; + + /* initialize the completed cookie to be less than + * the most recently used cookie + */ + iop_chan->completed_cookie = cookie - 1; + iop_chan->common.cookie = sw_desc->async_tx.cookie = cookie; + + /* channel should not be busy */ + BUG_ON(iop_chan_is_busy(iop_chan)); + + /* clear any prior error-status bits */ + iop_adma_device_clear_err_status(iop_chan); + + /* disable operation */ + iop_chan_disable(iop_chan); + + /* set the descriptor address */ + iop_chan_set_next_descriptor(iop_chan, sw_desc->async_tx.phys); + + /* 1/ don't add pre-chained descriptors + * 2/ dummy read to flush next_desc write + */ + BUG_ON(iop_desc_get_next_desc(sw_desc)); + + /* run the descriptor */ + iop_chan_enable(iop_chan); + } else + dev_printk(KERN_ERR, iop_chan->device->common.dev, + "failed to allocate null descriptor\n"); + spin_unlock_bh(&iop_chan->lock); +} + +static void 
iop_chan_start_null_xor(struct iop_adma_chan *iop_chan) +{ + struct iop_adma_desc_slot *sw_desc, *grp_start; + dma_cookie_t cookie; + int slot_cnt, slots_per_op; + + dev_dbg(iop_chan->device->common.dev, "%s\n", __FUNCTION__); + + spin_lock_bh(&iop_chan->lock); + slot_cnt = iop_chan_xor_slot_count(0, 2, &slots_per_op); + sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op); + if (sw_desc) { + grp_start = sw_desc->group_head; + list_splice_init(&sw_desc->async_tx.tx_list, &iop_chan->chain); + sw_desc->async_tx.ack = 1; + iop_desc_init_null_xor(grp_start, 2, 0); + iop_desc_set_byte_count(grp_start, iop_chan, 0); + iop_desc_set_dest_addr(grp_start, iop_chan, 0); + iop_desc_set_xor_src_addr(grp_start, 0, 0); + iop_desc_set_xor_src_addr(grp_start, 1, 0); + + cookie = iop_chan->common.cookie; + cookie++; + if (cookie <= 1) + cookie = 2; + + /* initialize the completed cookie to be less than + * the most recently used cookie + */ + iop_chan->completed_cookie = cookie - 1; + iop_chan->common.cookie = sw_desc->async_tx.cookie = cookie; + + /* channel should not be busy */ + BUG_ON(iop_chan_is_busy(iop_chan)); + + /* clear any prior error-status bits */ + iop_adma_device_clear_err_status(iop_chan); + + /* disable operation */ + iop_chan_disable(iop_chan); + + /* set the descriptor address */ + iop_chan_set_next_descriptor(iop_chan, sw_desc->async_tx.phys); + + /* 1/ don't add pre-chained descriptors + * 2/ dummy read to flush next_desc write + */ + BUG_ON(iop_desc_get_next_desc(sw_desc)); + + /* run the descriptor */ + iop_chan_enable(iop_chan); + } else + dev_printk(KERN_ERR, iop_chan->device->common.dev, + "failed to allocate null descriptor\n"); + spin_unlock_bh(&iop_chan->lock); +} + +static struct platform_driver iop_adma_driver = { + .probe = iop_adma_probe, + .remove = iop_adma_remove, + .driver = { + .owner = THIS_MODULE, + .name = "iop-adma", + }, +}; + +static int __init iop_adma_init (void) +{ + /* it's currently unsafe to unload this module */ + /* if 
forced, worst case is that rmmod hangs */ + __unsafe(THIS_MODULE); + + return platform_driver_register(&iop_adma_driver); +} + +static void __exit iop_adma_exit (void) +{ + platform_driver_unregister(&iop_adma_driver); + return; +} + +module_init(iop_adma_init); +module_exit(iop_adma_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("IOP ADMA Engine Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 466909f38d98..64bf3a81db93 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -109,6 +109,8 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select ASYNC_MEMCPY + select ASYNC_XOR ---help--- A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 2c45d7683ae9..c49366cdc05d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -18,7 +18,7 @@ raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ hostprogs-y := mktables # Note: link order is important. All raid personalities -# and xor.o must come before md.o, as they each initialise +# must come before md.o, as they each initialise # themselves, and md.o may use the personalities when it # auto-initialised. 
@@ -26,7 +26,7 @@ obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o -obj-$(CONFIG_MD_RAID456) += raid456.o xor.o +obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_BLK_DEV_MD) += md-mod.o diff --git a/drivers/md/md.c b/drivers/md/md.c index 1c54f3c1cca7..33beaa7da085 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5814,7 +5814,7 @@ static __exit void md_exit(void) } } -module_init(md_init) +subsys_initcall(md_init); module_exit(md_exit) static int get_ro(char *buffer, struct kernel_param *kp) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 061375ee6592..0b66afef2d82 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -52,6 +52,7 @@ #include "raid6.h" #include +#include /* * Stripe cache @@ -80,7 +81,6 @@ /* * The following can be used to debug the driver */ -#define RAID5_DEBUG 0 #define RAID5_PARANOIA 1 #if RAID5_PARANOIA && defined(CONFIG_SMP) # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) @@ -88,8 +88,7 @@ # define CHECK_DEVLOCK() #endif -#define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x))) -#if RAID5_DEBUG +#ifdef DEBUG #define inline #define __inline__ #endif @@ -104,6 +103,23 @@ static inline int raid6_next_disk(int disk, int raid_disks) disk++; return (disk < raid_disks) ? disk : 0; } + +static void return_io(struct bio *return_bi) +{ + struct bio *bi = return_bi; + while (bi) { + int bytes = bi->bi_size; + + return_bi = bi->bi_next; + bi->bi_next = NULL; + bi->bi_size = 0; + bi->bi_end_io(bi, bytes, + test_bit(BIO_UPTODATE, &bi->bi_flags) + ? 
0 : -EIO); + bi = return_bi; + } +} + static void print_raid5_conf (raid5_conf_t *conf); static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) @@ -125,6 +141,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { + BUG_ON(sh->ops.pending); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -152,7 +169,8 @@ static void release_stripe(struct stripe_head *sh) static inline void remove_hash(struct stripe_head *sh) { - PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); + pr_debug("remove_hash(), stripe %llu\n", + (unsigned long long)sh->sector); hlist_del_init(&sh->hash); } @@ -161,7 +179,8 @@ static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { struct hlist_head *hp = stripe_hash(conf, sh->sector); - PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); + pr_debug("insert_hash(), stripe %llu\n", + (unsigned long long)sh->sector); CHECK_DEVLOCK(); hlist_add_head(&sh->hash, hp); @@ -224,9 +243,10 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - + BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + CHECK_DEVLOCK(); - PRINTK("init_stripe called, stripe %llu\n", + pr_debug("init_stripe called, stripe %llu\n", (unsigned long long)sh->sector); remove_hash(sh); @@ -240,11 +260,11 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->toread || dev->towrite || dev->written || + if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk("sector=%llx i=%d %p %p %p %d\n", + printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", (unsigned 
long long)sh->sector, i, dev->toread, - dev->towrite, dev->written, + dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); BUG(); } @@ -260,11 +280,11 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in struct hlist_node *hn; CHECK_DEVLOCK(); - PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); + pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) if (sh->sector == sector && sh->disks == disks) return sh; - PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); + pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL; } @@ -276,7 +296,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector { struct stripe_head *sh; - PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector); + pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); spin_lock_irq(&conf->device_lock); @@ -324,6 +344,579 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } +/* test_and_ack_op() ensures that we only dequeue an operation once */ +#define test_and_ack_op(op, pend) \ +do { \ + if (test_bit(op, &sh->ops.pending) && \ + !test_bit(op, &sh->ops.complete)) { \ + if (test_and_set_bit(op, &sh->ops.ack)) \ + clear_bit(op, &pend); \ + else \ + ack++; \ + } else \ + clear_bit(op, &pend); \ +} while (0) + +/* find new work to run, do not resubmit work that is already + * in flight + */ +static unsigned long get_stripe_work(struct stripe_head *sh) +{ + unsigned long pending; + int ack = 0; + + pending = sh->ops.pending; + + test_and_ack_op(STRIPE_OP_BIOFILL, pending); + test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); + test_and_ack_op(STRIPE_OP_PREXOR, pending); + test_and_ack_op(STRIPE_OP_BIODRAIN, pending); + test_and_ack_op(STRIPE_OP_POSTXOR, pending); + test_and_ack_op(STRIPE_OP_CHECK, pending); + if 
(test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) + ack++; + + sh->ops.count -= ack; + BUG_ON(sh->ops.count < 0); + + return pending; +} + +static int +raid5_end_read_request(struct bio *bi, unsigned int bytes_done, int error); +static int +raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error); + +static void ops_run_io(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, disks = sh->disks; + + might_sleep(); + + for (i = disks; i--; ) { + int rw; + struct bio *bi; + mdk_rdev_t *rdev; + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) + rw = WRITE; + else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + rw = READ; + else + continue; + + bi = &sh->dev[i].req; + + bi->bi_rw = rw; + if (rw == WRITE) + bi->bi_end_io = raid5_end_write_request; + else + bi->bi_end_io = raid5_end_read_request; + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + if (rdev) { + if (test_bit(STRIPE_SYNCING, &sh->state) || + test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || + test_bit(STRIPE_EXPAND_READY, &sh->state)) + md_sync_acct(rdev->bdev, STRIPE_SECTORS); + + bi->bi_bdev = rdev->bdev; + pr_debug("%s: for %llu schedule op %ld on disc %d\n", + __FUNCTION__, (unsigned long long)sh->sector, + bi->bi_rw, i); + atomic_inc(&sh->count); + bi->bi_sector = sh->sector + rdev->data_offset; + bi->bi_flags = 1 << BIO_UPTODATE; + bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; + bi->bi_idx = 0; + bi->bi_io_vec = &sh->dev[i].vec; + bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_offset = 0; + bi->bi_size = STRIPE_SIZE; + bi->bi_next = NULL; + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, + &rdev->corrected_errors); + generic_make_request(bi); + } else { + if (rw == WRITE) + set_bit(STRIPE_DEGRADED, &sh->state); + pr_debug("skip op %ld on disc %d for 
sector %llu\n", + bi->bi_rw, i, (unsigned long long)sh->sector); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +} + +static struct dma_async_tx_descriptor * +async_copy_data(int frombio, struct bio *bio, struct page *page, + sector_t sector, struct dma_async_tx_descriptor *tx) +{ + struct bio_vec *bvl; + struct page *bio_page; + int i; + int page_offset; + + if (bio->bi_sector >= sector) + page_offset = (signed)(bio->bi_sector - sector) * 512; + else + page_offset = (signed)(sector - bio->bi_sector) * -512; + bio_for_each_segment(bvl, bio, i) { + int len = bio_iovec_idx(bio, i)->bv_len; + int clen; + int b_offset = 0; + + if (page_offset < 0) { + b_offset = -page_offset; + page_offset += b_offset; + len -= b_offset; + } + + if (len > 0 && page_offset + len > STRIPE_SIZE) + clen = STRIPE_SIZE - page_offset; + else + clen = len; + + if (clen > 0) { + b_offset += bio_iovec_idx(bio, i)->bv_offset; + bio_page = bio_iovec_idx(bio, i)->bv_page; + if (frombio) + tx = async_memcpy(page, bio_page, page_offset, + b_offset, clen, + ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_SRC, + tx, NULL, NULL); + else + tx = async_memcpy(bio_page, page, b_offset, + page_offset, clen, + ASYNC_TX_DEP_ACK | ASYNC_TX_KMAP_DST, + tx, NULL, NULL); + } + if (clen < len) /* hit end of page */ + break; + page_offset += len; + } + + return tx; +} + +static void ops_complete_biofill(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + struct bio *return_bi = NULL; + raid5_conf_t *conf = sh->raid_conf; + int i, more_to_read = 0; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + /* clear completed biofills */ + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + /* check if this stripe has new incoming reads */ + if (dev->toread) + more_to_read++; + + /* acknowledge completion of a biofill operation */ + /* and check if we need to reply to a read request + */ + if (test_bit(R5_Wantfill, &dev->flags) && 
!dev->toread) { + struct bio *rbi, *rbi2; + clear_bit(R5_Wantfill, &dev->flags); + + /* The access to dev->read is outside of the + * spin_lock_irq(&conf->device_lock), but is protected + * by the STRIPE_OP_BIOFILL pending bit + */ + BUG_ON(!dev->read); + rbi = dev->read; + dev->read = NULL; + while (rbi && rbi->bi_sector < + dev->sector + STRIPE_SECTORS) { + rbi2 = r5_next_bio(rbi, dev->sector); + spin_lock_irq(&conf->device_lock); + if (--rbi->bi_phys_segments == 0) { + rbi->bi_next = return_bi; + return_bi = rbi; + } + spin_unlock_irq(&conf->device_lock); + rbi = rbi2; + } + } + } + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); + + return_io(return_bi); + + if (more_to_read) + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void ops_run_biofill(struct stripe_head *sh) +{ + struct dma_async_tx_descriptor *tx = NULL; + raid5_conf_t *conf = sh->raid_conf; + int i; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_Wantfill, &dev->flags)) { + struct bio *rbi; + spin_lock_irq(&conf->device_lock); + dev->read = rbi = dev->toread; + dev->toread = NULL; + spin_unlock_irq(&conf->device_lock); + while (rbi && rbi->bi_sector < + dev->sector + STRIPE_SECTORS) { + tx = async_copy_data(0, rbi, dev->page, + dev->sector, tx); + rbi = r5_next_bio(rbi, dev->sector); + } + } + } + + atomic_inc(&sh->count); + async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, + ops_complete_biofill, sh); +} + +static void ops_complete_compute5(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + int target = sh->ops.target; + struct r5dev *tgt = &sh->dev[target]; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + set_bit(R5_UPTODATE, &tgt->flags); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + clear_bit(R5_Wantcompute, &tgt->flags); + 
set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static struct dma_async_tx_descriptor * +ops_run_compute5(struct stripe_head *sh, unsigned long pending) +{ + /* kernel stack size limits the total number of disks */ + int disks = sh->disks; + struct page *xor_srcs[disks]; + int target = sh->ops.target; + struct r5dev *tgt = &sh->dev[target]; + struct page *xor_dest = tgt->page; + int count = 0; + struct dma_async_tx_descriptor *tx; + int i; + + pr_debug("%s: stripe %llu block: %d\n", + __FUNCTION__, (unsigned long long)sh->sector, target); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + + for (i = disks; i--; ) + if (i != target) + xor_srcs[count++] = sh->dev[i].page; + + atomic_inc(&sh->count); + + if (unlikely(count == 1)) + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, + 0, NULL, ops_complete_compute5, sh); + else + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + ASYNC_TX_XOR_ZERO_DST, NULL, + ops_complete_compute5, sh); + + /* ack now if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) + async_tx_ack(tx); + + return tx; +} + +static void ops_complete_prexor(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); +} + +static struct dma_async_tx_descriptor * +ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +{ + /* kernel stack size limits the total number of disks */ + int disks = sh->disks; + struct page *xor_srcs[disks]; + int count = 0, pd_idx = sh->pd_idx, i; + + /* existing parity data subtracted */ + struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + /* Only process blocks that are known to be 
uptodate */ + if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) + xor_srcs[count++] = dev->page; + } + + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, + ops_complete_prexor, sh); + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +{ + int disks = sh->disks; + int pd_idx = sh->pd_idx, i; + + /* check if prexor is active which means only process blocks + * that are part of a read-modify-write (Wantprexor) + */ + int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + struct bio *chosen; + int towrite; + + towrite = 0; + if (prexor) { /* rmw */ + if (dev->towrite && + test_bit(R5_Wantprexor, &dev->flags)) + towrite = 1; + } else { /* rcw */ + if (i != pd_idx && dev->towrite && + test_bit(R5_LOCKED, &dev->flags)) + towrite = 1; + } + + if (towrite) { + struct bio *wbi; + + spin_lock(&sh->lock); + chosen = dev->towrite; + dev->towrite = NULL; + BUG_ON(dev->written); + wbi = dev->written = chosen; + spin_unlock(&sh->lock); + + while (wbi && wbi->bi_sector < + dev->sector + STRIPE_SECTORS) { + tx = async_copy_data(1, wbi, dev->page, + dev->sector, tx); + wbi = r5_next_bio(wbi, dev->sector); + } + } + } + + return tx; +} + +static void ops_complete_postxor(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void ops_complete_write(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + int disks = sh->disks, i, pd_idx = sh->pd_idx; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + for (i = disks; 
i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (dev->written || i == pd_idx) + set_bit(R5_UPTODATE, &dev->flags); + } + + set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); + set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +{ + /* kernel stack size limits the total number of disks */ + int disks = sh->disks; + struct page *xor_srcs[disks]; + + int count = 0, pd_idx = sh->pd_idx, i; + struct page *xor_dest; + int prexor = test_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + unsigned long flags; + dma_async_tx_callback callback; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + /* check if prexor is active which means only process blocks + * that are part of a read-modify-write (written) + */ + if (prexor) { + xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (dev->written) + xor_srcs[count++] = dev->page; + } + } else { + xor_dest = sh->dev[pd_idx].page; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (i != pd_idx) + xor_srcs[count++] = dev->page; + } + } + + /* check whether this postxor is part of a write */ + callback = test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) ? + ops_complete_write : ops_complete_postxor; + + /* 1/ if we prexor'd then the dest is reused as a source + * 2/ if we did not prexor then we are redoing the parity + * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST + * for the synchronous xor case + */ + flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | + (prexor ? 
ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); + + atomic_inc(&sh->count); + + if (unlikely(count == 1)) { + flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, + flags, tx, callback, sh); + } else + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + flags, tx, callback, sh); +} + +static void ops_complete_check(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + int pd_idx = sh->pd_idx; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && + sh->ops.zero_sum_result == 0) + set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + + set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +} + +static void ops_run_check(struct stripe_head *sh) +{ + /* kernel stack size limits the total number of disks */ + int disks = sh->disks; + struct page *xor_srcs[disks]; + struct dma_async_tx_descriptor *tx; + + int count = 0, pd_idx = sh->pd_idx, i; + struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + + pr_debug("%s: stripe %llu\n", __FUNCTION__, + (unsigned long long)sh->sector); + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (i != pd_idx) + xor_srcs[count++] = dev->page; + } + + tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); + + if (tx) + set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); + else + clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); + + atomic_inc(&sh->count); + tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, + ops_complete_check, sh); +} + +static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) +{ + int overlap_clear = 0, i, disks = sh->disks; + struct dma_async_tx_descriptor *tx = NULL; + + if (test_bit(STRIPE_OP_BIOFILL, &pending)) { + ops_run_biofill(sh); + overlap_clear++; 
+ } + + if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) + tx = ops_run_compute5(sh, pending); + + if (test_bit(STRIPE_OP_PREXOR, &pending)) + tx = ops_run_prexor(sh, tx); + + if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { + tx = ops_run_biodrain(sh, tx); + overlap_clear++; + } + + if (test_bit(STRIPE_OP_POSTXOR, &pending)) + ops_run_postxor(sh, tx); + + if (test_bit(STRIPE_OP_CHECK, &pending)) + ops_run_check(sh); + + if (test_bit(STRIPE_OP_IO, &pending)) + ops_run_io(sh); + + if (overlap_clear) + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&sh->raid_conf->wait_for_overlap); + } +} + static int grow_one_stripe(raid5_conf_t *conf) { struct stripe_head *sh; @@ -537,8 +1130,8 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, if (bi == &sh->dev[i].req) break; - PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n", - (unsigned long long)sh->sector, i, atomic_read(&sh->count), + pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", + (unsigned long long)sh->sector, i, atomic_read(&sh->count), uptodate); if (i == disks) { BUG(); @@ -613,7 +1206,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, if (bi == &sh->dev[i].req) break; - PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n", + pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), uptodate); if (i == disks) { @@ -658,7 +1251,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) { char b[BDEVNAME_SIZE]; raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - PRINTK("raid5: error called\n"); + pr_debug("raid5: error called\n"); if (!test_bit(Faulty, &rdev->flags)) { set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -916,137 +1509,13 @@ static void copy_data(int frombio, struct bio *bio, } } -#define check_xor() do { \ - if (count == MAX_XOR_BLOCKS) { \ - xor_block(count, STRIPE_SIZE, ptr); \ 
- count = 1; \ - } \ +#define check_xor() do { \ + if (count == MAX_XOR_BLOCKS) { \ + xor_blocks(count, STRIPE_SIZE, dest, ptr);\ + count = 0; \ + } \ } while(0) - -static void compute_block(struct stripe_head *sh, int dd_idx) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *p; - - PRINTK("compute_block, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - ptr[0] = page_address(sh->dev[dd_idx].page); - memset(ptr[0], 0, STRIPE_SIZE); - count = 1; - for (i = disks ; i--; ) { - if (i == dd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk(KERN_ERR "compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count != 1) - xor_block(count, STRIPE_SIZE, ptr); - set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); -} - -static void compute_parity5(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = sh->disks, count; - void *ptr[MAX_XOR_BLOCKS]; - struct bio *chosen; - - PRINTK("compute_parity5, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - count = 1; - ptr[0] = page_address(sh->dev[pd_idx].page); - switch(method) { - case READ_MODIFY_WRITE: - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); - for (i=disks ; i-- ;) { - if (i==pd_idx) - continue; - if (sh->dev[i].towrite && - test_bit(R5_UPTODATE, &sh->dev[i].flags)) { - ptr[count++] = page_address(sh->dev[i].page); - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - check_xor(); - } - } - break; - case RECONSTRUCT_WRITE: - memset(ptr[0], 0, STRIPE_SIZE); - for (i= disks; i-- ;) - if (i!=pd_idx && sh->dev[i].towrite) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; 
- - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - break; - } - if (count>1) { - xor_block(count, STRIPE_SIZE, ptr); - count = 1; - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - - switch(method) { - case RECONSTRUCT_WRITE: - case CHECK_PARITY: - for (i=disks; i--;) - if (i != pd_idx) { - ptr[count++] = page_address(sh->dev[i].page); - check_xor(); - } - break; - case READ_MODIFY_WRITE: - for (i = disks; i--;) - if (sh->dev[i].written) { - ptr[count++] = page_address(sh->dev[i].page); - check_xor(); - } - } - if (count != 1) - xor_block(count, STRIPE_SIZE, ptr); - - if (method != CHECK_PARITY) { - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); - } else - clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); -} - static void compute_parity6(struct stripe_head *sh, int method) { raid6_conf_t *conf = sh->raid_conf; @@ -1058,7 +1527,7 @@ static void compute_parity6(struct stripe_head *sh, int method) qd_idx = raid6_next_disk(pd_idx, disks); d0_idx = raid6_next_disk(qd_idx, disks); - PRINTK("compute_parity, stripe %llu, method %d\n", + pr_debug("compute_parity, stripe %llu, method %d\n", (unsigned long long)sh->sector, method); switch(method) { @@ -1132,20 +1601,20 @@ static void compute_parity6(struct stripe_head *sh, int method) static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *p; + void *ptr[MAX_XOR_BLOCKS], *dest, *p; int pd_idx = sh->pd_idx; int qd_idx = raid6_next_disk(pd_idx, 
disks); - PRINTK("compute_block_1, stripe %llu, idx %d\n", + pr_debug("compute_block_1, stripe %llu, idx %d\n", (unsigned long long)sh->sector, dd_idx); if ( dd_idx == qd_idx ) { /* We're actually computing the Q drive */ compute_parity6(sh, UPDATE_PARITY); } else { - ptr[0] = page_address(sh->dev[dd_idx].page); - if (!nozero) memset(ptr[0], 0, STRIPE_SIZE); - count = 1; + dest = page_address(sh->dev[dd_idx].page); + if (!nozero) memset(dest, 0, STRIPE_SIZE); + count = 0; for (i = disks ; i--; ) { if (i == dd_idx || i == qd_idx) continue; @@ -1159,8 +1628,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) check_xor(); } - if (count != 1) - xor_block(count, STRIPE_SIZE, ptr); + if (count) + xor_blocks(count, STRIPE_SIZE, dest, ptr); if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); } @@ -1183,7 +1652,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) BUG_ON(faila == failb); if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } - PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", + pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); if ( failb == disks-1 ) { @@ -1229,7 +1698,79 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } } +static int +handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + int locked = 0; + if (rcw) { + /* if we are not expanding this is a proper write request, and + * there will be bios with new data to be drained into the + * stripe cache + */ + if (!expand) { + set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); + sh->ops.count++; + } + + set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + sh->ops.count++; + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + 
if (!expand) + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } else { + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + + set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); + set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + + sh->ops.count += 3; + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (i == pd_idx) + continue; + + /* For a read-modify write there may be blocks that are + * locked for reading while others are ready to be + * written so we distinguish these blocks by the + * R5_Wantprexor bit + */ + if (dev->towrite && + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantprexor, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } + + /* keep the parity disk locked while asynchronous operations + * are in flight + */ + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + locked++; + + pr_debug("%s: stripe %llu locked: %d pending: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + locked, sh->ops.pending); + + return locked; +} /* * Each stripe/dev can have one or more bion attached. 
@@ -1242,7 +1783,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in raid5_conf_t *conf = sh->raid_conf; int firstwrite=0; - PRINTK("adding bh b#%llu to stripe s#%llu\n", + pr_debug("adding bh b#%llu to stripe s#%llu\n", (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector); @@ -1271,7 +1812,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in spin_unlock_irq(&conf->device_lock); spin_unlock(&sh->lock); - PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n", + pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector, dd_idx); @@ -1326,6 +1867,729 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) return pd_idx; } +static void +handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks, + struct bio **return_bi) +{ + int i; + for (i = disks; i--; ) { + struct bio *bi; + int bitmap_end = 0; + + if (test_bit(R5_ReadError, &sh->dev[i].flags)) { + mdk_rdev_t *rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(In_sync, &rdev->flags)) + /* multiple read failures in one stripe */ + md_error(conf->mddev, rdev); + rcu_read_unlock(); + } + spin_lock_irq(&conf->device_lock); + /* fail all writes first */ + bi = sh->dev[i].towrite; + sh->dev[i].towrite = NULL; + if (bi) { + s->to_write--; + bitmap_end = 1; + } + + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + + while (bi && bi->bi_sector < + sh->dev[i].sector + STRIPE_SECTORS) { + struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (--bi->bi_phys_segments == 0) { + md_write_end(conf->mddev); + bi->bi_next = *return_bi; + *return_bi = bi; + } + bi = nextbi; + } + /* and fail all 'written' */ + bi = sh->dev[i].written; + sh->dev[i].written = NULL; + if (bi) 
bitmap_end = 1; + while (bi && bi->bi_sector < + sh->dev[i].sector + STRIPE_SECTORS) { + struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (--bi->bi_phys_segments == 0) { + md_write_end(conf->mddev); + bi->bi_next = *return_bi; + *return_bi = bi; + } + bi = bi2; + } + + /* fail any reads if this device is non-operational and + * the data has not reached the cache yet. + */ + if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && + (!test_bit(R5_Insync, &sh->dev[i].flags) || + test_bit(R5_ReadError, &sh->dev[i].flags))) { + bi = sh->dev[i].toread; + sh->dev[i].toread = NULL; + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + wake_up(&conf->wait_for_overlap); + if (bi) s->to_read--; + while (bi && bi->bi_sector < + sh->dev[i].sector + STRIPE_SECTORS) { + struct bio *nextbi = + r5_next_bio(bi, sh->dev[i].sector); + clear_bit(BIO_UPTODATE, &bi->bi_flags); + if (--bi->bi_phys_segments == 0) { + bi->bi_next = *return_bi; + *return_bi = bi; + } + bi = nextbi; + } + } + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); + } + +} + +/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks + * to process + */ +static int __handle_issuing_new_read_requests5(struct stripe_head *sh, + struct stripe_head_state *s, int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; + struct r5dev *failed_dev = &sh->dev[s->failed_num]; + + /* don't schedule compute operations or reads on the parity block while + * a check is in flight + */ + if ((disk_idx == sh->pd_idx) && + test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + return ~0; + + /* is the data in this block needed, and can we get it? 
*/ + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || (s->failed && + (failed_dev->toread || (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags) + ))))) { + /* 1/ We would like to get this block, possibly by computing it, + * but we might not be able to. + * + * 2/ Since parity check operations potentially make the parity + * block !uptodate it will need to be refreshed before any + * compute operations on data disks are scheduled. + * + * 3/ We hold off parity block re-reads until check operations + * have quiesced. + */ + if ((s->uptodate == disks - 1) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + set_bit(R5_Wantcompute, &dev->flags); + sh->ops.target = disk_idx; + s->req_compute = 1; + sh->ops.count++; + /* Careful: from this point on 'uptodate' is in the eye + * of raid5_run_ops which services 'compute' operations + * before writes. R5_Wantcompute flags a block that will + * be R5_UPTODATE by the time it is needed for a + * subsequent operation. + */ + s->uptodate++; + return 0; /* uptodate + compute == disks */ + } else if ((s->uptodate < disks - 1) && + test_bit(R5_Insync, &dev->flags)) { + /* Note: we hold off compute operations while checks are + * in flight, but we still prefer 'compute' over 'read' + * hence we only read if (uptodate < * disks-1) + */ + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", disk_idx, + s->syncing); + } + } + + return ~0; +} + +static void handle_issuing_new_read_requests5(struct stripe_head *sh, + struct stripe_head_state *s, int disks) +{ + int i; + + /* Clear completed compute operations. 
Parity recovery + * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled + * later on in this routine + */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* look for blocks to read/compute, skip this if a compute + * is already in flight, or if the stripe contents are in the + * midst of changing due to a write + */ + if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + for (i = disks; i--; ) + if (__handle_issuing_new_read_requests5( + sh, s, i, disks) == 0) + break; + } + set_bit(STRIPE_HANDLE, &sh->state); +} + +static void handle_issuing_new_read_requests6(struct stripe_head *sh, + struct stripe_head_state *s, struct r6_state *r6s, + int disks) +{ + int i; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || (dev->towrite && + !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed >= 1 && + (sh->dev[r6s->failed_num[0]].toread || + s->to_write)) || + (s->failed >= 2 && + (sh->dev[r6s->failed_num[1]].toread || + s->to_write)))) { + /* we would like to get this block, possibly + * by computing it, but we might not be able to + */ + if (s->uptodate == disks-1) { + pr_debug("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, i); + compute_block_1(sh, i, 0); + s->uptodate++; + } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { + /* Computing 2-failure is *very* expensive; only + * do it if failed >= 2 + */ + int other; + for (other = disks; other--; ) { + if (other == i) + continue; + if (!test_bit(R5_UPTODATE, + &sh->dev[other].flags)) + break; + } + 
BUG_ON(other < 0); + pr_debug("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, + i, other); + compute_block_2(sh, i, other); + s->uptodate += 2; + } else if (test_bit(R5_Insync, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + pr_debug("Reading block %d (sync=%d)\n", + i, s->syncing); + } + } + } + set_bit(STRIPE_HANDLE, &sh->state); +} + + +/* handle_completed_write_requests + * any written block on an uptodate or failed drive can be returned. + * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but + * never LOCKED, so we don't need to test 'failed' directly. + */ +static void handle_completed_write_requests(raid5_conf_t *conf, + struct stripe_head *sh, int disks, struct bio **return_bi) +{ + int i; + struct r5dev *dev; + + for (i = disks; i--; ) + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) { + /* We can return any write requests */ + struct bio *wbi, *wbi2; + int bitmap_end = 0; + pr_debug("Return write for disc %d\n", i); + spin_lock_irq(&conf->device_lock); + wbi = dev->written; + dev->written = NULL; + while (wbi && wbi->bi_sector < + dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (--wbi->bi_phys_segments == 0) { + md_write_end(conf->mddev); + wbi->bi_next = *return_bi; + *return_bi = wbi; + } + wbi = wbi2; + } + if (dev->towrite == NULL) + bitmap_end = 1; + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), + 0); + } + } +} + +static void handle_issuing_new_write_requests5(raid5_conf_t *conf, + struct stripe_head *sh, struct stripe_head_state *s, int disks) +{ + int rmw = 0, rcw = 0, i; + for (i = disks; i--; ) { + /* would I have to read this buffer for read_modify_write */ + struct r5dev *dev = &sh->dev[i]; + if ((dev->towrite || i == 
sh->pd_idx) && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + if (test_bit(R5_Insync, &dev->flags)) + rmw++; + else + rmw += 2*disks; /* cannot read it */ + } + /* Would I have to read this buffer for reconstruct_write */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + if (test_bit(R5_Insync, &dev->flags)) rcw++; + else + rcw += 2*disks; + } + } + pr_debug("for sector %llu, rmw=%d rcw=%d\n", + (unsigned long long)sh->sector, rmw, rcw); + set_bit(STRIPE_HANDLE, &sh->state); + if (rmw < rcw && rmw > 0) + /* prefer read-modify-write, but need to get some data */ + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if ((dev->towrite || i == sh->pd_idx) && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags)) && + test_bit(R5_Insync, &dev->flags)) { + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + pr_debug("Read_old block " + "%d for r-m-w\n", i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit( + STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + s->locked++; + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + if (rcw <= rmw && rcw > 0) + /* want reconstruct write, but need to get some data */ + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (!test_bit(R5_OVERWRITE, &dev->flags) && + i != sh->pd_idx && + !test_bit(R5_LOCKED, &dev->flags) && + !(test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags)) && + test_bit(R5_Insync, &dev->flags)) { + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + pr_debug("Read_old block " + "%d for Reconstruct\n", i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); 
+ if (!test_and_set_bit( + STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + s->locked++; + } else { + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + /* now if nothing is locked, and if we have enough data, + * we can start a write request + */ + /* since handle_stripe can be called at any time we need to handle the + * case where a compute block operation has been submitted and then a + * subsequent call wants to start a write request. raid5_run_ops only + * handles the case where compute block and postxor are requested + * simultaneously. If this is not the case then new writes need to be + * held off until the compute completes. + */ + if ((s->req_compute || + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + s->locked += handle_write_operations5(sh, rcw == 0, 0); +} + +static void handle_issuing_new_write_requests6(raid5_conf_t *conf, + struct stripe_head *sh, struct stripe_head_state *s, + struct r6_state *r6s, int disks) +{ + int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; + int qd_idx = r6s->qd_idx; + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + /* Would I have to read this buffer for reconstruct_write */ + if (!test_bit(R5_OVERWRITE, &dev->flags) + && i != pd_idx && i != qd_idx + && (!test_bit(R5_LOCKED, &dev->flags) + ) && + !test_bit(R5_UPTODATE, &dev->flags)) { + if (test_bit(R5_Insync, &dev->flags)) rcw++; + else { + pr_debug("raid6: must_compute: " + "disk %d flags=%#lx\n", i, dev->flags); + must_compute++; + } + } + } + pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", + (unsigned long long)sh->sector, rcw, must_compute); + set_bit(STRIPE_HANDLE, &sh->state); + + if (rcw > 0) + /* want reconstruct write, but need to get some data */ + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (!test_bit(R5_OVERWRITE, &dev->flags) + && !(s->failed == 0 && (i == pd_idx || i == 
qd_idx)) + && !test_bit(R5_LOCKED, &dev->flags) && + !test_bit(R5_UPTODATE, &dev->flags) && + test_bit(R5_Insync, &dev->flags)) { + if ( + test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + pr_debug("Read_old stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantread, &dev->flags); + s->locked++; + } else { + pr_debug("Request delayed stripe %llu " + "block %d for Reconstruct\n", + (unsigned long long)sh->sector, i); + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } + } + /* now if nothing is locked, and if we have enough data, we can start a + * write request + */ + if (s->locked == 0 && rcw == 0 && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { + if (must_compute > 0) { + /* We have failed blocks and need to compute them */ + switch (s->failed) { + case 0: + BUG(); + case 1: + compute_block_1(sh, r6s->failed_num[0], 0); + break; + case 2: + compute_block_2(sh, r6s->failed_num[0], + r6s->failed_num[1]); + break; + default: /* This request should have been failed? 
*/ + BUG(); + } + } + + pr_debug("Computing parity for stripe %llu\n", + (unsigned long long)sh->sector); + compute_parity6(sh, RECONSTRUCT_WRITE); + /* now every locked buffer is ready to be written */ + for (i = disks; i--; ) + if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { + pr_debug("Writing stripe %llu block %d\n", + (unsigned long long)sh->sector, i); + s->locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ + set_bit(STRIPE_INSYNC, &sh->state); + + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + } +} + +static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks) +{ + set_bit(STRIPE_HANDLE, &sh->state); + /* Take one of the following actions: + * 1/ start a check parity operation if (uptodate == disks) + * 2/ finish a check parity operation and act on the result + * 3/ skip to the writeback section if we previously + * initiated a recovery operation + */ + if (s->failed == 0 && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + BUG_ON(s->uptodate != disks); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.count++; + s->uptodate--; + } else if ( + test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { + clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); + clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += + STRIPE_SECTORS; + if (test_bit( + MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! 
*/ + set_bit(STRIPE_INSYNC, &sh->state); + else { + set_bit(STRIPE_OP_COMPUTE_BLK, + &sh->ops.pending); + set_bit(STRIPE_OP_MOD_REPAIR_PD, + &sh->ops.pending); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + sh->ops.count++; + s->uptodate++; + } + } + } + } + + /* check if we can clear a parity disk reconstruct */ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + + clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* Wait for check parity and compute block operations to complete + * before write-back + */ + if (!test_bit(STRIPE_INSYNC, &sh->state) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { + struct r5dev *dev; + /* either failed parity check, or recovery is happening */ + if (s->failed == 0) + s->failed_num = sh->pd_idx; + dev = &sh->dev[s->failed_num]; + BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); + BUG_ON(s->uptodate != disks); + + set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + + clear_bit(STRIPE_DEGRADED, &sh->state); + s->locked++; + set_bit(STRIPE_INSYNC, &sh->state); + } +} + + +static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, + struct stripe_head_state *s, + struct r6_state *r6s, struct page *tmp_page, + int disks) +{ + int update_p = 0, update_q = 0; + struct r5dev *dev; + int pd_idx = sh->pd_idx; + int qd_idx = r6s->qd_idx; + + set_bit(STRIPE_HANDLE, &sh->state); + + BUG_ON(s->failed > 2); + BUG_ON(s->uptodate < disks); + /* Want to check and possibly repair P and Q. 
+ * However there could be one 'failed' device, in which + * case we can only check one of them, possibly using the + * other to generate missing data + */ + + /* If !tmp_page, we cannot do the calculations, + * but as we have set STRIPE_HANDLE, we will soon be called + * by stripe_handle with a tmp_page - just wait until then. + */ + if (tmp_page) { + if (s->failed == r6s->q_failed) { + /* The only possible failed device holds 'Q', so it + * makes sense to check P (If anything else were failed, + * we would have used P to recreate it). + */ + compute_block_1(sh, pd_idx, 1); + if (!page_is_zero(sh->dev[pd_idx].page)) { + compute_block_1(sh, pd_idx, 0); + update_p = 1; + } + } + if (!r6s->q_failed && s->failed < 2) { + /* q is not failed, and we didn't use it to generate + * anything, so it makes sense to check it + */ + memcpy(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE); + compute_parity6(sh, UPDATE_PARITY); + if (memcmp(page_address(tmp_page), + page_address(sh->dev[qd_idx].page), + STRIPE_SIZE) != 0) { + clear_bit(STRIPE_INSYNC, &sh->state); + update_q = 1; + } + } + if (update_p || update_q) { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! 
*/
+				update_p = update_q = 0;
+		}
+
+		/* now write out any block on a failed drive,
+		 * or P or Q if they need it
+		 */
+
+		if (s->failed == 2) {
+			dev = &sh->dev[r6s->failed_num[1]];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (s->failed >= 1) {
+			dev = &sh->dev[r6s->failed_num[0]];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+
+		if (update_p) {
+			dev = &sh->dev[pd_idx];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		if (update_q) {
+			dev = &sh->dev[qd_idx];
+			s->locked++;
+			set_bit(R5_LOCKED, &dev->flags);
+			set_bit(R5_Wantwrite, &dev->flags);
+		}
+		clear_bit(STRIPE_DEGRADED, &sh->state);
+
+		set_bit(STRIPE_INSYNC, &sh->state);
+	}
+}
+
+/* handle_stripe_expansion - copy the data blocks of an expand source stripe
+ * into the stripes that hold them in the target layout
+ * @conf: array configuration; raid_disks and max_degraded are used to
+ *	locate each block's target stripe
+ * @sh: source stripe whose blocks have been read
+ * @r6s: supplies the Q index to skip; may be NULL, in which case only the
+ *	P block (sh->pd_idx) is skipped
+ *
+ * Copies are submitted with async_memcpy and chained through a single
+ * descriptor; the routine acks and waits on the last descriptor before
+ * returning, so it does not return while a submitted copy is outstanding.
+ */
+static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
+	struct r6_state *r6s)
+{
+	int i;
+
+	/* We have read all the blocks in this stripe and now we need to
+	 * copy some of them into a target stripe for expand.
+	 */
+	struct dma_async_tx_descriptor *tx = NULL;
+	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+	for (i = 0; i < sh->disks; i++)
+		/* skip P always, and Q only when a Q index was supplied; a
+		 * NULL r6s must not exclude every disk
+		 */
+		if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
+			int dd_idx, pd_idx, j;
+			struct stripe_head *sh2;
+
+			sector_t bn = compute_blocknr(sh, i);
+			sector_t s = raid5_compute_sector(bn, conf->raid_disks,
+						conf->raid_disks -
+						conf->max_degraded, &dd_idx,
+						&pd_idx, conf);
+			sh2 = get_active_stripe(conf, s, conf->raid_disks,
+						pd_idx, 1);
+			if (sh2 == NULL)
+				/* so far only the early blocks of this stripe
+				 * have been requested. When later blocks
+				 * get requested, we will try again
+				 */
+				continue;
+			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
+			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
+				/* must have already done this block */
+				release_stripe(sh2);
+				continue;
+			}
+
+			/* place all the copies on one channel */
+			tx = async_memcpy(sh2->dev[dd_idx].page,
+				sh->dev[i].page, 0, 0, STRIPE_SIZE,
+				ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+
+			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
+			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
+			/* the target is ready once every data block in it has
+			 * been expanded.
+			 * NOTE(review): the Q index tested here is the source
+			 * stripe's; confirm it is also right for sh2.
+			 */
+			for (j = 0; j < conf->raid_disks; j++)
+				if (j != sh2->pd_idx &&
+				    (!r6s || j != r6s->qd_idx) &&
+				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
+					break;
+			if (j == conf->raid_disks) {
+				set_bit(STRIPE_EXPAND_READY, &sh2->state);
+				set_bit(STRIPE_HANDLE, &sh2->state);
+			}
+			release_stripe(sh2);
+
+		}
+	/* done submitting copies, wait for them to complete; this is done
+	 * after the loop so it still happens when the final disk was
+	 * skipped, and only when a copy was actually submitted
+	 */
+	if (tx) {
+		async_tx_ack(tx);
+		dma_wait_for_async_tx(tx);
+	}
+}
 /*
  * handle_stripe - do things to a stripe.
@@ -1339,81 +2603,70 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
  *    schedule a write of some buffers
  *    return confirmation of parity correctness
  *
- * Parity calculations are done inside the stripe lock
  * buffers are taken off read_list or write_list, and bh_cache buffers
  * get BH_Lock set before the stripe lock is released.
* */ - + static void handle_stripe5(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; - int disks = sh->disks; - struct bio *return_bi= NULL; - struct bio *bi; - int i; - int syncing, expanding, expanded; - int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int non_overwrite = 0; - int failed_num=0; + int disks = sh->disks, i; + struct bio *return_bi = NULL; + struct stripe_head_state s; struct r5dev *dev; + unsigned long pending = 0; - PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", - (unsigned long long)sh->sector, atomic_read(&sh->count), - sh->pd_idx); + memset(&s, 0, sizeof(s)); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " + "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, + sh->ops.pending, sh->ops.ack, sh->ops.complete); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - syncing = test_bit(STRIPE_SYNCING, &sh->state); - expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s.syncing = test_bit(STRIPE_SYNCING, &sh->state); + s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; - dev = &sh->dev[i]; + struct r5dev *dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - PRINTK("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - PRINTK("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && 
rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - spin_unlock_irq(&conf->device_lock); - rbi = rbi2; - } - } + pr_debug("check %d: state 0x%lx toread %p read %p write %p " + "written %p\n", i, dev->flags, dev->toread, dev->read, + dev->towrite, dev->written); + + /* maybe we can request a biofill operation + * + * new wantfill requests are only permitted while + * STRIPE_OP_BIOFILL is clear + */ + if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && + !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + set_bit(R5_Wantfill, &dev->flags); /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; - - if (dev->toread) to_read++; + if (test_bit(R5_Wantfill, &dev->flags)) + s.to_fill++; + else if (dev->toread) + s.to_read++; if (dev->towrite) { - to_write++; + s.to_write++; if (!test_bit(R5_OVERWRITE, &dev->flags)) - non_overwrite++; + s.non_overwrite++; } - if (dev->written) written++; + if (dev->written) + s.written++; rdev = rcu_dereference(conf->disks[i].rdev); if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ @@ -1422,306 +2675,131 @@ static void handle_stripe5(struct stripe_head *sh) } if (!rdev || !test_bit(In_sync, &rdev->flags) || test_bit(R5_ReadError, &dev->flags)) { - failed++; - failed_num = i; + s.failed++; + s.failed_num = i; } else set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); - PRINTK("locked=%d uptodate=%d to_read=%d" + + if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + sh->ops.count++; + + pr_debug("locked=%d uptodate=%d 
to_read=%d" " to_write=%d failed=%d failed_num=%d\n", - locked, uptodate, to_read, to_write, failed, failed_num); + s.locked, s.uptodate, s.to_read, s.to_write, + s.failed, s.failed_num); /* check if the array has lost two devices and, if so, some requests might * need to be failed */ - if (failed > 1 && to_read+to_write+written) { - for (i=disks; i--; ) { - int bitmap_end = 0; - - if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - mdk_rdev_t *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) - /* multiple read failures in one stripe */ - md_error(conf->mddev, rdev); - rcu_read_unlock(); - } - - spin_lock_irq(&conf->device_lock); - /* fail all writes first */ - bi = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - if (bi) { to_write--; bitmap_end = 1; } - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - bi->bi_next = return_bi; - return_bi = bi; - } - bi = nextbi; - } - /* and fail all 'written' */ - bi = sh->dev[i].written; - sh->dev[i].written = NULL; - if (bi) bitmap_end = 1; - while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - bi->bi_next = return_bi; - return_bi = bi; - } - bi = bi2; - } - - /* fail any reads if this device is non-operational */ - if (!test_bit(R5_Insync, &sh->dev[i].flags) || - test_bit(R5_ReadError, &sh->dev[i].flags)) { - bi = sh->dev[i].toread; - sh->dev[i].toread = NULL; - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - if (bi) to_read--; - while (bi && bi->bi_sector < 
sh->dev[i].sector + STRIPE_SECTORS){ - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - bi->bi_next = return_bi; - return_bi = bi; - } - bi = nextbi; - } - } - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); - } - } - if (failed > 1 && syncing) { + if (s.failed > 1 && s.to_read+s.to_write+s.written) + handle_requests_to_failed_array(conf, sh, &s, disks, + &return_bi); + if (s.failed > 1 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); - syncing = 0; + s.syncing = 0; } /* might be able to return some write requests if the parity block * is safe, or on a failed drive */ dev = &sh->dev[sh->pd_idx]; - if ( written && - ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) - || (failed == 1 && failed_num == sh->pd_idx)) - ) { - /* any written block on an uptodate or failed drive can be returned. - * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but - * never LOCKED, so we don't need to test 'failed' directly. 
- */ - for (i=disks; i--; ) - if (sh->dev[i].written) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags) ) { - /* We can return any write requests */ - struct bio *wbi, *wbi2; - int bitmap_end = 0; - PRINTK("Return write for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - wbi = dev->written; - dev->written = NULL; - while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - wbi->bi_next = return_bi; - return_bi = wbi; - } - wbi = wbi2; - } - if (dev->towrite == NULL) - bitmap_end = 1; - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, - !test_bit(STRIPE_DEGRADED, &sh->state), 0); - } - } - } + if ( s.written && + ((test_bit(R5_Insync, &dev->flags) && + !test_bit(R5_LOCKED, &dev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) || + (s.failed == 1 && s.failed_num == sh->pd_idx))) + handle_completed_write_requests(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. 
*/ - if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - syncing || - expanding || - (failed && (sh->dev[failed_num].toread || - (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) - ) - ) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to - */ - if (uptodate == disks-1) { - PRINTK("Computing block %d\n", i); - compute_block(sh, i); - uptodate++; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; - PRINTK("Reading block %d (sync=%d)\n", - i, syncing); - } - } - } - set_bit(STRIPE_HANDLE, &sh->state); + if (s.to_read || s.non_overwrite || + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || + test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + handle_issuing_new_read_requests5(sh, &s, disks); + + /* Now we check to see if any write operations have recently + * completed + */ + + /* leave prexor set until postxor is done, allows us to distinguish + * a rmw from a rcw during biodrain + */ + if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + + for (i = disks; i--; ) + clear_bit(R5_Wantprexor, &sh->dev[i].flags); } - /* now to consider writing and what else, if anything should be read */ - if (to_write) { - int rmw=0, rcw=0; - for (i=disks ; i--;) { - /* would I have to read this buffer for read_modify_write */ + /* if only POSTXOR is set then this is an 'expand' postxor */ + if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, 
&sh->ops.complete)) { + + clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); + clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); + clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); + + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + + /* All the 'written' buffers and the parity block are ready to + * be written back to disk + */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + for (i = disks; i--; ) { dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && - (!test_bit(R5_LOCKED, &dev->flags) - ) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if (test_bit(R5_Insync, &dev->flags) -/* && !(!mddev->insync && i == sh->pd_idx) */ - ) - rmw++; - else rmw += 2*disks; /* cannot read it */ - } - /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && - (!test_bit(R5_LOCKED, &dev->flags) - ) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; - else rcw += 2*disks; + if (test_bit(R5_LOCKED, &dev->flags) && + (i == sh->pd_idx || dev->written)) { + pr_debug("Writing block %d\n", i); + set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit( + STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + if (!test_bit(R5_Insync, &dev->flags) || + (i == sh->pd_idx && s.failed == 0)) + set_bit(STRIPE_INSYNC, &sh->state); } } - PRINTK("for sector %llu, rmw=%d rcw=%d\n", - (unsigned long long)sh->sector, rmw, rcw); - set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) - /* prefer read-modify-write, but need to get some data */ - for (i=disks; i--;) { - dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx) && - !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Insync, &dev->flags)) { - if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - { - PRINTK("Read_old block %d for r-m-w\n", i); - set_bit(R5_LOCKED, 
&dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; - } else { - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } - if (rcw <= rmw && rcw > 0) - /* want reconstruct write, but need to get some data */ - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && - !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Insync, &dev->flags)) { - if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - { - PRINTK("Read_old block %d for Reconstruct\n", i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; - } else { - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } - /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && (rcw == 0 ||rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - PRINTK("Computing parity...\n"); - compute_parity5(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); - /* now every locked buffer is ready to be written */ - for (i=disks; i--;) - if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { - PRINTK("Writing block %d\n", i); - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - if (!test_bit(R5_Insync, &sh->dev[i].flags) - || (i==sh->pd_idx && failed == 0)) - set_bit(STRIPE_INSYNC, &sh->state); - } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); } } + /* Now to consider new write requests and what else, if anything + * should be read. 
We do not handle new writes when: + * 1/ A 'write' operation (copy+xor) is already in flight. + * 2/ A 'check' operation is in flight, as it may clobber the parity + * block. + */ + if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) + handle_issuing_new_write_requests5(conf, sh, &s, disks); + /* maybe we need to check and possibly fix the parity for this stripe - * Any reads will already have been scheduled, so we just see if enough data - * is available + * Any reads will already have been scheduled, so we just see if enough + * data is available. The parity check is held off while parity + * dependent operations are in flight. */ - if (syncing && locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state)) { - set_bit(STRIPE_HANDLE, &sh->state); - if (failed == 0) { - BUG_ON(uptodate != disks); - compute_parity5(sh, CHECK_PARITY); - uptodate--; - if (page_is_zero(sh->dev[sh->pd_idx].page)) { - /* parity is correct (on disc, not in buffer any more) */ - set_bit(STRIPE_INSYNC, &sh->state); - } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! 
*/ - set_bit(STRIPE_INSYNC, &sh->state); - else { - compute_block(sh, sh->pd_idx); - uptodate++; - } - } - } - if (!test_bit(STRIPE_INSYNC, &sh->state)) { - /* either failed parity check, or recovery is happening */ - if (failed==0) - failed_num = sh->pd_idx; - dev = &sh->dev[failed_num]; - BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); - BUG_ON(uptodate != disks); + if ((s.syncing && s.locked == 0 && + !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_INSYNC, &sh->state)) || + test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) + handle_parity_checks5(conf, sh, &s, disks); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - clear_bit(STRIPE_DEGRADED, &sh->state); - locked++; - set_bit(STRIPE_INSYNC, &sh->state); - } - } - if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { + if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS,1); clear_bit(STRIPE_SYNCING, &sh->state); } @@ -1729,186 +2807,102 @@ static void handle_stripe5(struct stripe_head *sh) /* If the failed drive is just a ReadError, then we might need to progress * the repair/check process */ - if (failed == 1 && ! 
conf->mddev->ro && - test_bit(R5_ReadError, &sh->dev[failed_num].flags) - && !test_bit(R5_LOCKED, &sh->dev[failed_num].flags) - && test_bit(R5_UPTODATE, &sh->dev[failed_num].flags) + if (s.failed == 1 && !conf->mddev->ro && + test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) + && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) + && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) ) { - dev = &sh->dev[failed_num]; + dev = &sh->dev[s.failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); - locked++; + s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); - locked++; + s.locked++; } } - if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Finish postxor operations initiated by the expansion + * process + */ + if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && + !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { + + clear_bit(STRIPE_EXPANDING, &sh->state); + + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + + for (i = conf->raid_disks; i--; ) { + set_bit(R5_Wantwrite, &sh->dev[i].flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; + } + } + + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - compute_parity5(sh, RECONSTRUCT_WRITE); - for (i= conf->raid_disks; i--;) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - clear_bit(STRIPE_EXPANDING, 
&sh->state); - } else if (expanded) { + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, + conf->raid_disks); + s.locked += handle_write_operations5(sh, 0, 1); + } else if (s.expanded && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); md_done_sync(conf->mddev, STRIPE_SECTORS, 1); } - if (expanding && locked == 0) { - /* We have read all the blocks in this stripe and now we need to - * copy some of them into a target stripe for expand. - */ - clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); - for (i=0; i< sh->disks; i++) - if (i != sh->pd_idx) { - int dd_idx, pd_idx, j; - struct stripe_head *sh2; + if (s.expanding && s.locked == 0) + handle_stripe_expansion(conf, sh, NULL); - sector_t bn = compute_blocknr(sh, i); - sector_t s = raid5_compute_sector(bn, conf->raid_disks, - conf->raid_disks-1, - &dd_idx, &pd_idx, conf); - sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1); - if (sh2 == NULL) - /* so far only the early blocks of this stripe - * have been requested. 
When later blocks - * get requested, we will try again - */ - continue; - if(!test_bit(STRIPE_EXPANDING, &sh2->state) || - test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { - /* must have already done this block */ - release_stripe(sh2); - continue; - } - memcpy(page_address(sh2->dev[dd_idx].page), - page_address(sh->dev[i].page), - STRIPE_SIZE); - set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); - set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); - for (j=0; j<conf->raid_disks; j++) - if (j != sh2->pd_idx && - !test_bit(R5_Expanded, &sh2->dev[j].flags)) - break; - if (j == conf->raid_disks) { - set_bit(STRIPE_EXPAND_READY, &sh2->state); - set_bit(STRIPE_HANDLE, &sh2->state); - } - release_stripe(sh2); - } - } + if (sh->ops.count) + pending = get_stripe_work(sh); spin_unlock(&sh->lock); - while ((bi=return_bi)) { - int bytes = bi->bi_size; + if (pending) + raid5_run_ops(sh, pending); - return_bi = bi->bi_next; - bi->bi_next = NULL; - bi->bi_size = 0; - bi->bi_end_io(bi, bytes, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 
0 : -EIO); - } - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else - continue; - - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (syncing || expanding || expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); + return_io(return_bi); - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", - (unsigned long long)sh->sector, bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - generic_make_request(bi); - } else { - if (rw == WRITE) - set_bit(STRIPE_DEGRADED, &sh->state); - PRINTK("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } - } } static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { raid6_conf_t *conf = sh->raid_conf; int disks = sh->disks; - struct bio *return_bi= NULL; - struct bio *bi; - int i; - int syncing, expanding, expanded; - int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int non_overwrite = 0; - int failed_num[2] = {0, 0}; + struct bio *return_bi = NULL; + int i, 
pd_idx = sh->pd_idx; + struct stripe_head_state s; + struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - int p_failed, q_failed; - PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n", - (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), - pd_idx, qd_idx); + r6s.qd_idx = raid6_next_disk(pd_idx, disks); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, " + "pd_idx=%d, qd_idx=%d\n", + (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), pd_idx, r6s.qd_idx); + memset(&s, 0, sizeof(s)); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - syncing = test_bit(STRIPE_SYNCING, &sh->state); - expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s.syncing = test_bit(STRIPE_SYNCING, &sh->state); + s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -1917,12 +2911,12 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - PRINTK("check %d: state 0x%lx read %p write %p written %p\n", + pr_debug("check %d: state 0x%lx read %p write %p written %p\n", i, dev->flags, dev->toread, dev->towrite, dev->written); /* maybe we can reply to a read */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { struct bio *rbi, *rbi2; - PRINTK("Return read for disc %d\n", i); + pr_debug("Return read for disc %d\n", i); spin_lock_irq(&conf->device_lock); rbi = dev->toread; dev->toread = NULL; @@ -1943,17 +2937,19 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + if (test_bit(R5_LOCKED, 
&dev->flags)) s.locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; - if (dev->toread) to_read++; + if (dev->toread) + s.to_read++; if (dev->towrite) { - to_write++; + s.to_write++; if (!test_bit(R5_OVERWRITE, &dev->flags)) - non_overwrite++; + s.non_overwrite++; } - if (dev->written) written++; + if (dev->written) + s.written++; rdev = rcu_dereference(conf->disks[i].rdev); if (!rdev || !test_bit(In_sync, &rdev->flags)) { /* The ReadError flag will just be confusing now */ @@ -1962,96 +2958,27 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } if (!rdev || !test_bit(In_sync, &rdev->flags) || test_bit(R5_ReadError, &dev->flags)) { - if ( failed < 2 ) - failed_num[failed] = i; - failed++; + if (s.failed < 2) + r6s.failed_num[s.failed] = i; + s.failed++; } else set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); - PRINTK("locked=%d uptodate=%d to_read=%d" + pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d,%d\n", - locked, uptodate, to_read, to_write, failed, - failed_num[0], failed_num[1]); - /* check if the array has lost >2 devices and, if so, some requests might - * need to be failed + s.locked, s.uptodate, s.to_read, s.to_write, s.failed, + r6s.failed_num[0], r6s.failed_num[1]); + /* check if the array has lost >2 devices and, if so, some requests + * might need to be failed */ - if (failed > 2 && to_read+to_write+written) { - for (i=disks; i--; ) { - int bitmap_end = 0; - - if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - mdk_rdev_t *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) - /* multiple read failures in one stripe */ - md_error(conf->mddev, rdev); - rcu_read_unlock(); - } - - spin_lock_irq(&conf->device_lock); - /* fail all writes first */ - bi = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - if (bi) { to_write--; bitmap_end = 1; } - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - 
wake_up(&conf->wait_for_overlap); - - while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - bi->bi_next = return_bi; - return_bi = bi; - } - bi = nextbi; - } - /* and fail all 'written' */ - bi = sh->dev[i].written; - sh->dev[i].written = NULL; - if (bi) bitmap_end = 1; - while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - bi->bi_next = return_bi; - return_bi = bi; - } - bi = bi2; - } - - /* fail any reads if this device is non-operational */ - if (!test_bit(R5_Insync, &sh->dev[i].flags) || - test_bit(R5_ReadError, &sh->dev[i].flags)) { - bi = sh->dev[i].toread; - sh->dev[i].toread = NULL; - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - if (bi) to_read--; - while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { - bi->bi_next = return_bi; - return_bi = bi; - } - bi = nextbi; - } - } - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); - } - } - if (failed > 2 && syncing) { + if (s.failed > 2 && s.to_read+s.to_write+s.written) + handle_requests_to_failed_array(conf, sh, &s, disks, + &return_bi); + if (s.failed > 2 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); - syncing = 0; + s.syncing = 0; } /* @@ -2059,279 +2986,41 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) * are safe, or on a failed drive */ pdev = &sh->dev[pd_idx]; - p_failed = (failed >= 1 && 
failed_num[0] == pd_idx) - || (failed >= 2 && failed_num[1] == pd_idx); - qdev = &sh->dev[qd_idx]; - q_failed = (failed >= 1 && failed_num[0] == qd_idx) - || (failed >= 2 && failed_num[1] == qd_idx); + r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) + || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); + qdev = &sh->dev[r6s.qd_idx]; + r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) + || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); - if ( written && - ( p_failed || ((test_bit(R5_Insync, &pdev->flags) + if ( s.written && + ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags))) ) && - ( q_failed || ((test_bit(R5_Insync, &qdev->flags) + && test_bit(R5_UPTODATE, &pdev->flags)))) && + ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))) ) ) { - /* any written block on an uptodate or failed drive can be - * returned. Note that if we 'wrote' to a failed drive, - * it will be UPTODATE, but never LOCKED, so we don't need - * to test 'failed' directly. 
- */ - for (i=disks; i--; ) - if (sh->dev[i].written) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags) ) { - /* We can return any write requests */ - int bitmap_end = 0; - struct bio *wbi, *wbi2; - PRINTK("Return write for stripe %llu disc %d\n", - (unsigned long long)sh->sector, i); - spin_lock_irq(&conf->device_lock); - wbi = dev->written; - dev->written = NULL; - while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { - md_write_end(conf->mddev); - wbi->bi_next = return_bi; - return_bi = wbi; - } - wbi = wbi2; - } - if (dev->towrite == NULL) - bitmap_end = 1; - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, - !test_bit(STRIPE_DEGRADED, &sh->state), 0); - } - } - } + && test_bit(R5_UPTODATE, &qdev->flags))))) + handle_completed_write_requests(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. 
*/ - if (to_read || non_overwrite || (to_write && failed) || - (syncing && (uptodate < disks)) || expanding) { - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - syncing || - expanding || - (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || - (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) - ) - ) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to - */ - if (uptodate == disks-1) { - PRINTK("Computing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - compute_block_1(sh, i, 0); - uptodate++; - } else if ( uptodate == disks-2 && failed >= 2 ) { - /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ - int other; - for (other=disks; other--;) { - if ( other == i ) - continue; - if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) ) - break; - } - BUG_ON(other < 0); - PRINTK("Computing stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, i, other); - compute_block_2(sh, i, other); - uptodate += 2; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; - PRINTK("Reading block %d (sync=%d)\n", - i, syncing); - } - } - } - set_bit(STRIPE_HANDLE, &sh->state); - } + if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || + (s.syncing && (s.uptodate < disks)) || s.expanding) + handle_issuing_new_read_requests6(sh, &s, &r6s, disks); /* now to consider writing and what else, if anything should be read */ - if (to_write) { - int rcw=0, must_compute=0; - for (i=disks ; i--;) { - dev = &sh->dev[i]; - /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) - && i != pd_idx && i != qd_idx - && (!test_bit(R5_LOCKED, &dev->flags) - ) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if 
(test_bit(R5_Insync, &dev->flags)) rcw++; - else { - PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags); - must_compute++; - } - } - } - PRINTK("for sector %llu, rcw=%d, must_compute=%d\n", - (unsigned long long)sh->sector, rcw, must_compute); - set_bit(STRIPE_HANDLE, &sh->state); - - if (rcw > 0) - /* want reconstruct write, but need to get some data */ - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_OVERWRITE, &dev->flags) - && !(failed == 0 && (i == pd_idx || i == qd_idx)) - && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Insync, &dev->flags)) { - if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - { - PRINTK("Read_old stripe %llu block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; - } else { - PRINTK("Request delayed stripe %llu block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } - /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && rcw == 0 && - !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - if ( must_compute > 0 ) { - /* We have failed blocks and need to compute them */ - switch ( failed ) { - case 0: BUG(); - case 1: compute_block_1(sh, failed_num[0], 0); break; - case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; - default: BUG(); /* This request should have been failed? 
*/ - } - } - - PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector); - compute_parity6(sh, RECONSTRUCT_WRITE); - /* now every locked buffer is ready to be written */ - for (i=disks; i--;) - if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { - PRINTK("Writing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ - set_bit(STRIPE_INSYNC, &sh->state); - - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - } - } + if (s.to_write) + handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe - * Any reads will already have been scheduled, so we just see if enough data - * is available + * Any reads will already have been scheduled, so we just see if enough + * data is available */ - if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) { - int update_p = 0, update_q = 0; - struct r5dev *dev; + if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) + handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); - set_bit(STRIPE_HANDLE, &sh->state); - - BUG_ON(failed>2); - BUG_ON(uptodate < disks); - /* Want to check and possibly repair P and Q. - * However there could be one 'failed' device, in which - * case we can only check one of them, possibly using the - * other to generate missing data - */ - - /* If !tmp_page, we cannot do the calculations, - * but as we have set STRIPE_HANDLE, we will soon be called - * by stripe_handle with a tmp_page - just wait until then. 
- */ - if (tmp_page) { - if (failed == q_failed) { - /* The only possible failed device holds 'Q', so it makes - * sense to check P (If anything else were failed, we would - * have used P to recreate it). - */ - compute_block_1(sh, pd_idx, 1); - if (!page_is_zero(sh->dev[pd_idx].page)) { - compute_block_1(sh,pd_idx,0); - update_p = 1; - } - } - if (!q_failed && failed < 2) { - /* q is not failed, and we didn't use it to generate - * anything, so it makes sense to check it - */ - memcpy(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE); - compute_parity6(sh, UPDATE_PARITY); - if (memcmp(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE)!= 0) { - clear_bit(STRIPE_INSYNC, &sh->state); - update_q = 1; - } - } - if (update_p || update_q) { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - update_p = update_q = 0; - } - - /* now write out any block on a failed drive, - * or P or Q if they need it - */ - - if (failed == 2) { - dev = &sh->dev[failed_num[1]]; - locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (failed >= 1) { - dev = &sh->dev[failed_num[0]]; - locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - - if (update_p) { - dev = &sh->dev[pd_idx]; - locked ++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - if (update_q) { - dev = &sh->dev[qd_idx]; - locked++; - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantwrite, &dev->flags); - } - clear_bit(STRIPE_DEGRADED, &sh->state); - - set_bit(STRIPE_INSYNC, &sh->state); - } - } - - if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { + if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS,1); clear_bit(STRIPE_SYNCING, &sh->state); } @@ -2339,9 +3028,9 @@ static void handle_stripe6(struct 
stripe_head *sh, struct page *tmp_page) /* If the failed drives are just a ReadError, then we might need * to progress the repair/check process */ - if (failed <= 2 && ! conf->mddev->ro) - for (i=0; i<failed; i++) { - dev = &sh->dev[failed_num[i]]; + if (s.failed <= 2 && !conf->mddev->ro) + for (i = 0; i < s.failed; i++) { + dev = &sh->dev[r6s.failed_num[i]]; if (test_bit(R5_ReadError, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags) @@ -2358,7 +3047,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } } - if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { /* Need to write out all blocks after computing P&Q */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, @@ -2366,82 +3055,24 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) compute_parity6(sh, RECONSTRUCT_WRITE); for (i = conf->raid_disks ; i-- ; ) { set_bit(R5_LOCKED, &sh->dev[i].flags); - locked++; + s.locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); } clear_bit(STRIPE_EXPANDING, &sh->state); - } else if (expanded) { + } else if (s.expanded) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); md_done_sync(conf->mddev, STRIPE_SECTORS, 1); } - if (expanding && locked == 0) { - /* We have read all the blocks in this stripe and now we need to - * copy some of them into a target stripe for expand. 
- */ - clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); - for (i = 0; i < sh->disks ; i++) - if (i != pd_idx && i != qd_idx) { - int dd_idx2, pd_idx2, j; - struct stripe_head *sh2; - - sector_t bn = compute_blocknr(sh, i); - sector_t s = raid5_compute_sector( - bn, conf->raid_disks, - conf->raid_disks - conf->max_degraded, - &dd_idx2, &pd_idx2, conf); - sh2 = get_active_stripe(conf, s, - conf->raid_disks, - pd_idx2, 1); - if (sh2 == NULL) - /* so for only the early blocks of - * this stripe have been requests. - * When later blocks get requests, we - * will try again - */ - continue; - if (!test_bit(STRIPE_EXPANDING, &sh2->state) || - test_bit(R5_Expanded, - &sh2->dev[dd_idx2].flags)) { - /* must have already done this block */ - release_stripe(sh2); - continue; - } - memcpy(page_address(sh2->dev[dd_idx2].page), - page_address(sh->dev[i].page), - STRIPE_SIZE); - set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags); - set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags); - for (j = 0 ; j < conf->raid_disks ; j++) - if (j != sh2->pd_idx && - j != raid6_next_disk(sh2->pd_idx, - sh2->disks) && - !test_bit(R5_Expanded, - &sh2->dev[j].flags)) - break; - if (j == conf->raid_disks) { - set_bit(STRIPE_EXPAND_READY, - &sh2->state); - set_bit(STRIPE_HANDLE, &sh2->state); - } - release_stripe(sh2); - } - } + if (s.expanding && s.locked == 0) + handle_stripe_expansion(conf, sh, &r6s); spin_unlock(&sh->lock); - while ((bi=return_bi)) { - int bytes = bi->bi_size; + return_io(return_bi); - return_bi = bi->bi_next; - bi->bi_next = NULL; - bi->bi_size = 0; - bi->bi_end_io(bi, bytes, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 
0 : -EIO); - } for (i=disks; i-- ;) { int rw; struct bio *bi; @@ -2470,11 +3101,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) rcu_read_unlock(); if (rdev) { - if (syncing || expanding || expanded) + if (s.syncing || s.expanding || s.expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", + pr_debug("for %llu schedule op %ld on disc %d\n", (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); bi->bi_sector = sh->sector + rdev->data_offset; @@ -2494,7 +3125,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } else { if (rw == WRITE) set_bit(STRIPE_DEGRADED, &sh->state); - PRINTK("skip op %ld on disc %d for sector %llu\n", + pr_debug("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); @@ -2738,7 +3369,7 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) } - PRINTK("raid5_align_endio : io error...handing IO for a retry\n"); + pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); add_bio_to_retry(raid_bi, conf); return 0; @@ -2776,7 +3407,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) mdk_rdev_t *rdev; if (!in_chunk_boundary(mddev, raid_bio)) { - PRINTK("chunk_aligned_read : non aligned\n"); + pr_debug("chunk_aligned_read : non aligned\n"); return 0; } /* @@ -2900,7 +3531,7 @@ static int make_request(request_queue_t *q, struct bio * bi) new_sector = raid5_compute_sector(logical_sector, disks, data_disks, &dd_idx, &pd_idx, conf); - PRINTK("raid5: make_request, sector %llu logical %llu\n", + pr_debug("raid5: make_request, sector %llu logical %llu\n", (unsigned long long)new_sector, (unsigned long long)logical_sector); @@ -3273,7 +3904,7 @@ static void raid5d (mddev_t *mddev) raid5_conf_t *conf = mddev_to_conf(mddev); int handled; - 
PRINTK("+++ raid5d active\n"); + pr_debug("+++ raid5d active\n"); md_check_recovery(mddev); @@ -3308,8 +3939,10 @@ static void raid5d (mddev_t *mddev) handled++; } - if (list_empty(&conf->handle_list)) + if (list_empty(&conf->handle_list)) { + async_tx_issue_pending_all(); break; + } first = conf->handle_list.next; sh = list_entry(first, struct stripe_head, lru); @@ -3325,13 +3958,13 @@ static void raid5d (mddev_t *mddev) spin_lock_irq(&conf->device_lock); } - PRINTK("%d stripes handled\n", handled); + pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); unplug_slaves(mddev); - PRINTK("--- raid5d inactive\n"); + pr_debug("--- raid5d inactive\n"); } static ssize_t @@ -3507,7 +4140,7 @@ static int run(mddev_t *mddev) atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); - PRINTK("raid5: run(%s) called.\n", mdname(mddev)); + pr_debug("raid5: run(%s) called.\n", mdname(mddev)); ITERATE_RDEV(mddev,rdev,tmp) { raid_disk = rdev->raid_disk; @@ -3690,7 +4323,7 @@ static int stop(mddev_t *mddev) return 0; } -#if RAID5_DEBUG +#ifdef DEBUG static void print_sh (struct seq_file *seq, struct stripe_head *sh) { int i; @@ -3737,7 +4370,7 @@ static void status (struct seq_file *seq, mddev_t *mddev) conf->disks[i].rdev && test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "]"); -#if RAID5_DEBUG +#ifdef DEBUG seq_printf (seq, "\n"); printall(seq, conf); #endif diff --git a/include/asm-arm/arch-iop13xx/adma.h b/include/asm-arm/arch-iop13xx/adma.h new file mode 100644 index 000000000000..04006c1c5fd7 --- /dev/null +++ b/include/asm-arm/arch-iop13xx/adma.h @@ -0,0 +1,544 @@ +/* + * Copyright(c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
/*
 * ADMA channel register addresses, expressed as offsets from the
 * channel's register base (the mmr_base member of @chan).
 *
 * @chan: pointer to a channel object that has an mmr_base member
 * @src:  source index for the per-source address registers; each
 *        increment of @src moves ADMA_SLAR/ADMA_SUAR forward by 8 bytes
 *
 * Every macro parameter is parenthesised so that expression arguments
 * such as "&chans[i]" or "idx + 1" expand with the intended precedence.
 */
#define ADMA_ACCR(chan)		((chan)->mmr_base + 0x0)
#define ADMA_ACSR(chan)		((chan)->mmr_base + 0x4)
#define ADMA_ADAR(chan)		((chan)->mmr_base + 0x8)
#define ADMA_IIPCR(chan)	((chan)->mmr_base + 0x18)
#define ADMA_IIPAR(chan)	((chan)->mmr_base + 0x1c)
#define ADMA_IIPUAR(chan)	((chan)->mmr_base + 0x20)
#define ADMA_ANDAR(chan)	((chan)->mmr_base + 0x24)
#define ADMA_ADCR(chan)		((chan)->mmr_base + 0x28)
#define ADMA_CARMD(chan)	((chan)->mmr_base + 0x2c)
#define ADMA_ABCR(chan)		((chan)->mmr_base + 0x30)
#define ADMA_DLADR(chan)	((chan)->mmr_base + 0x34)
#define ADMA_DUADR(chan)	((chan)->mmr_base + 0x38)
#define ADMA_SLAR(src, chan)	((chan)->mmr_base + (0x3c + ((src) << 3)))
#define ADMA_SUAR(src, chan)	((chan)->mmr_base + (0x40 + ((src) << 3)))
relax_order_en:1; + unsigned int no_snoop_en:1; +}; + +struct iop13xx_adma_byte_count { + unsigned int byte_count:24; + unsigned int host_if:3; + unsigned int reserved:2; + unsigned int zero_result_err_q:1; + unsigned int zero_result_err:1; + unsigned int tx_complete:1; +}; + +struct iop13xx_adma_desc_hw { + u32 next_desc; + union { + u32 desc_ctrl; + struct iop13xx_adma_desc_ctrl desc_ctrl_field; + }; + union { + u32 crc_addr; + u32 block_fill_data; + u32 q_dest_addr; + }; + union { + u32 byte_count; + struct iop13xx_adma_byte_count byte_count_field; + }; + union { + u32 dest_addr; + u32 p_dest_addr; + }; + union { + u32 upper_dest_addr; + u32 pq_upper_dest_addr; + }; + struct iop13xx_adma_src src[1]; +}; + +struct iop13xx_adma_desc_dual_xor { + u32 next_desc; + u32 desc_ctrl; + u32 reserved; + u32 byte_count; + u32 h_dest_addr; + u32 h_upper_dest_addr; + u32 src0_addr; + u32 upper_src0_addr; + u32 src1_addr; + u32 upper_src1_addr; + u32 h_src_addr; + u32 h_upper_src_addr; + u32 d_src_addr; + u32 d_upper_src_addr; + u32 d_dest_addr; + u32 d_upper_dest_addr; +}; + +struct iop13xx_adma_desc_pq_update { + u32 next_desc; + u32 desc_ctrl; + u32 reserved; + u32 byte_count; + u32 p_dest_addr; + u32 p_upper_dest_addr; + u32 src0_addr; + u32 upper_src0_addr; + u32 src1_addr; + u32 upper_src1_addr; + u32 p_src_addr; + u32 p_upper_src_addr; + u32 q_src_addr; + struct { + unsigned int q_upper_src_addr:24; + unsigned int q_dmlt:8; + }; + u32 q_dest_addr; + u32 q_upper_dest_addr; +}; + +static inline int iop_adma_get_max_xor(void) +{ + return 16; +} + +static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan) +{ + return __raw_readl(ADMA_ADAR(chan)); +} + +static inline void iop_chan_set_next_descriptor(struct iop_adma_chan *chan, + u32 next_desc_addr) +{ + __raw_writel(next_desc_addr, ADMA_ANDAR(chan)); +} + +#define ADMA_STATUS_BUSY (1 << 13) + +static inline char iop_chan_is_busy(struct iop_adma_chan *chan) +{ + if (__raw_readl(ADMA_ACSR(chan)) & + 
/**
 * iop_chan_xor_slot_count - descriptor slots needed for one XOR operation
 * @len: transfer length in bytes; not used, the result depends only on
 *	 @src_cnt
 * @src_cnt: number of source buffers
 * @slots_per_op: out parameter, receives the same value that is returned
 *
 * One slot holds the basic descriptor together with the first source.
 * Each further source takes 8 bytes and a slot is 32 bytes, so every
 * started group of four additional sources costs one more slot.
 *
 * The count is computed with a ceiling division instead of shifting
 * (src_cnt - 1), because left-shifting a negative value is undefined.
 * A @src_cnt below 1 is treated as having no additional sources.
 *
 * Returns the number of slots.
 */
static inline int
iop_chan_xor_slot_count(size_t len, int src_cnt, int *slots_per_op)
{
	int extra_srcs = src_cnt > 1 ? src_cnt - 1 : 0;
	int num_slots = 1 + (extra_srcs + 3) / 4;

	*slots_per_op = num_slots;

	return num_slots;
}
desc->hw_desc; + return hw_desc->desc_ctrl_field.src_select + 1; +} + +static inline void +iop_desc_init_memcpy(struct iop_adma_desc_slot *desc, int int_en) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + union { + u32 value; + struct iop13xx_adma_desc_ctrl field; + } u_desc_ctrl; + + u_desc_ctrl.value = 0; + u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */ + u_desc_ctrl.field.int_en = int_en; + hw_desc->desc_ctrl = u_desc_ctrl.value; + hw_desc->crc_addr = 0; +} + +static inline void +iop_desc_init_memset(struct iop_adma_desc_slot *desc, int int_en) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + union { + u32 value; + struct iop13xx_adma_desc_ctrl field; + } u_desc_ctrl; + + u_desc_ctrl.value = 0; + u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */ + u_desc_ctrl.field.block_fill_en = 1; + u_desc_ctrl.field.int_en = int_en; + hw_desc->desc_ctrl = u_desc_ctrl.value; + hw_desc->crc_addr = 0; +} + +/* to do: support buffers larger than ADMA_MAX_BYTE_COUNT */ +static inline void +iop_desc_init_xor(struct iop_adma_desc_slot *desc, int src_cnt, int int_en) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + union { + u32 value; + struct iop13xx_adma_desc_ctrl field; + } u_desc_ctrl; + + u_desc_ctrl.value = 0; + u_desc_ctrl.field.src_select = src_cnt - 1; + u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus */ + u_desc_ctrl.field.int_en = int_en; + hw_desc->desc_ctrl = u_desc_ctrl.value; + hw_desc->crc_addr = 0; + +} +#define iop_desc_init_null_xor(d, s, i) iop_desc_init_xor(d, s, i) + +/* to do: support buffers larger than ADMA_MAX_BYTE_COUNT */ +static inline int +iop_desc_init_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt, int int_en) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + union { + u32 value; + struct iop13xx_adma_desc_ctrl field; + } u_desc_ctrl; + + u_desc_ctrl.value = 0; + u_desc_ctrl.field.src_select = src_cnt - 1; + u_desc_ctrl.field.xfer_dir = 3; /* local to internal bus 
*/ + u_desc_ctrl.field.zero_result = 1; + u_desc_ctrl.field.status_write_back_en = 1; + u_desc_ctrl.field.int_en = int_en; + hw_desc->desc_ctrl = u_desc_ctrl.value; + hw_desc->crc_addr = 0; + + return 1; +} + +static inline void iop_desc_set_byte_count(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *chan, + u32 byte_count) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + hw_desc->byte_count = byte_count; +} + +static inline void +iop_desc_set_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len) +{ + int slots_per_op = desc->slots_per_op; + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc, *iter; + int i = 0; + + if (len <= IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT) { + hw_desc->byte_count = len; + } else { + do { + iter = iop_hw_desc_slot_idx(hw_desc, i); + iter->byte_count = IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT; + len -= IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT; + i += slots_per_op; + } while (len > IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT); + + if (len) { + iter = iop_hw_desc_slot_idx(hw_desc, i); + iter->byte_count = len; + } + } +} + + +static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *chan, + dma_addr_t addr) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + hw_desc->dest_addr = addr; + hw_desc->upper_dest_addr = 0; +} + +static inline void iop_desc_set_memcpy_src_addr(struct iop_adma_desc_slot *desc, + dma_addr_t addr) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + hw_desc->src[0].src_addr = addr; + hw_desc->src[0].upper_src_addr = 0; +} + +static inline void iop_desc_set_xor_src_addr(struct iop_adma_desc_slot *desc, + int src_idx, dma_addr_t addr) +{ + int slot_cnt = desc->slot_cnt, slots_per_op = desc->slots_per_op; + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc, *iter; + int i = 0; + + do { + iter = iop_hw_desc_slot_idx(hw_desc, i); + iter->src[src_idx].src_addr = addr; + iter->src[src_idx].upper_src_addr = 0; + slot_cnt -= slots_per_op; + if (slot_cnt) { + i += 
slots_per_op; + addr += IOP_ADMA_XOR_MAX_BYTE_COUNT; + } + } while (slot_cnt); +} + +static inline void +iop_desc_init_interrupt(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *chan) +{ + iop_desc_init_memcpy(desc, 1); + iop_desc_set_byte_count(desc, chan, 0); + iop_desc_set_dest_addr(desc, chan, 0); + iop_desc_set_memcpy_src_addr(desc, 0); +} + +#define iop_desc_set_zero_sum_src_addr iop_desc_set_xor_src_addr + +static inline void iop_desc_set_next_desc(struct iop_adma_desc_slot *desc, + u32 next_desc_addr) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + BUG_ON(hw_desc->next_desc); + hw_desc->next_desc = next_desc_addr; +} + +static inline u32 iop_desc_get_next_desc(struct iop_adma_desc_slot *desc) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + return hw_desc->next_desc; +} + +static inline void iop_desc_clear_next_desc(struct iop_adma_desc_slot *desc) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + hw_desc->next_desc = 0; +} + +static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc, + u32 val) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + hw_desc->block_fill_data = val; +} + +static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc) +{ + struct iop13xx_adma_desc_hw *hw_desc = desc->hw_desc; + struct iop13xx_adma_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field; + struct iop13xx_adma_byte_count byte_count = hw_desc->byte_count_field; + + BUG_ON(!(byte_count.tx_complete && desc_ctrl.zero_result)); + + if (desc_ctrl.pq_xfer_en) + return byte_count.zero_result_err_q; + else + return byte_count.zero_result_err; +} + +static inline void iop_chan_append(struct iop_adma_chan *chan) +{ + u32 adma_accr; + + adma_accr = __raw_readl(ADMA_ACCR(chan)); + adma_accr |= 0x2; + __raw_writel(adma_accr, ADMA_ACCR(chan)); +} + +static inline void iop_chan_idle(int busy, struct iop_adma_chan *chan) +{ + do { } while (0); +} + +static inline u32 iop_chan_get_status(struct 
iop_adma_chan *chan) +{ + return __raw_readl(ADMA_ACSR(chan)); +} + +static inline void iop_chan_disable(struct iop_adma_chan *chan) +{ + u32 adma_chan_ctrl = __raw_readl(ADMA_ACCR(chan)); + adma_chan_ctrl &= ~0x1; + __raw_writel(adma_chan_ctrl, ADMA_ACCR(chan)); +} + +static inline void iop_chan_enable(struct iop_adma_chan *chan) +{ + u32 adma_chan_ctrl; + + adma_chan_ctrl = __raw_readl(ADMA_ACCR(chan)); + adma_chan_ctrl |= 0x1; + __raw_writel(adma_chan_ctrl, ADMA_ACCR(chan)); +} + +static inline void iop_adma_device_clear_eot_status(struct iop_adma_chan *chan) +{ + u32 status = __raw_readl(ADMA_ACSR(chan)); + status &= (1 << 12); + __raw_writel(status, ADMA_ACSR(chan)); +} + +static inline void iop_adma_device_clear_eoc_status(struct iop_adma_chan *chan) +{ + u32 status = __raw_readl(ADMA_ACSR(chan)); + status &= (1 << 11); + __raw_writel(status, ADMA_ACSR(chan)); +} + +static inline void iop_adma_device_clear_err_status(struct iop_adma_chan *chan) +{ + u32 status = __raw_readl(ADMA_ACSR(chan)); + status &= (1 << 9) | (1 << 5) | (1 << 4) | (1 << 3); + __raw_writel(status, ADMA_ACSR(chan)); +} + +static inline int +iop_is_err_int_parity(unsigned long status, struct iop_adma_chan *chan) +{ + return test_bit(9, &status); +} + +static inline int +iop_is_err_mcu_abort(unsigned long status, struct iop_adma_chan *chan) +{ + return test_bit(5, &status); +} + +static inline int +iop_is_err_int_tabort(unsigned long status, struct iop_adma_chan *chan) +{ + return test_bit(4, &status); +} + +static inline int +iop_is_err_int_mabort(unsigned long status, struct iop_adma_chan *chan) +{ + return test_bit(3, &status); +} + +static inline int +iop_is_err_pci_tabort(unsigned long status, struct iop_adma_chan *chan) +{ + return 0; +} + +static inline int +iop_is_err_pci_mabort(unsigned long status, struct iop_adma_chan *chan) +{ + return 0; +} + +static inline int +iop_is_err_split_tx(unsigned long status, struct iop_adma_chan *chan) +{ + return 0; +} + +#endif /* _ADMA_H */ diff 
--git a/include/asm-arm/arch-iop13xx/iop13xx.h b/include/asm-arm/arch-iop13xx/iop13xx.h index e6736c3d1f7f..d4e4f828577c 100644 --- a/include/asm-arm/arch-iop13xx/iop13xx.h +++ b/include/asm-arm/arch-iop13xx/iop13xx.h @@ -166,12 +166,22 @@ static inline int iop13xx_cpu_id(void) #define IOP13XX_INIT_I2C_1 (1 << 1) #define IOP13XX_INIT_I2C_2 (1 << 2) -#define IQ81340_NUM_UART 2 -#define IQ81340_NUM_I2C 3 -#define IQ81340_NUM_PHYS_MAP_FLASH 1 -#define IQ81340_MAX_PLAT_DEVICES (IQ81340_NUM_UART +\ - IQ81340_NUM_I2C +\ - IQ81340_NUM_PHYS_MAP_FLASH) +/* ADMA selection flags */ +/* INIT_ADMA_DEFAULT = Rely on CONFIG_IOP13XX_ADMA* */ +#define IOP13XX_INIT_ADMA_DEFAULT (0) +#define IOP13XX_INIT_ADMA_0 (1 << 0) +#define IOP13XX_INIT_ADMA_1 (1 << 1) +#define IOP13XX_INIT_ADMA_2 (1 << 2) + +/* Platform devices */ +#define IQ81340_NUM_UART 2 +#define IQ81340_NUM_I2C 3 +#define IQ81340_NUM_PHYS_MAP_FLASH 1 +#define IQ81340_NUM_ADMA 3 +#define IQ81340_MAX_PLAT_DEVICES (IQ81340_NUM_UART + \ + IQ81340_NUM_I2C + \ + IQ81340_NUM_PHYS_MAP_FLASH + \ + IQ81340_NUM_ADMA) /*========================== PMMR offsets for key registers ============*/ #define IOP13XX_ATU0_PMMR_OFFSET 0x00048000 @@ -444,22 +454,6 @@ static inline int iop13xx_cpu_id(void) /*==============================ADMA UNITS===============================*/ #define IOP13XX_ADMA_PHYS_BASE(chan) IOP13XX_REG_ADDR32_PHYS((chan << 9)) #define IOP13XX_ADMA_UPPER_PA(chan) (IOP13XX_ADMA_PHYS_BASE(chan) + 0xc0) -#define IOP13XX_ADMA_OFFSET(chan, ofs) IOP13XX_REG_ADDR32((chan << 9) + (ofs)) - -#define IOP13XX_ADMA_ACCR(chan) IOP13XX_ADMA_OFFSET(chan, 0x0) -#define IOP13XX_ADMA_ACSR(chan) IOP13XX_ADMA_OFFSET(chan, 0x4) -#define IOP13XX_ADMA_ADAR(chan) IOP13XX_ADMA_OFFSET(chan, 0x8) -#define IOP13XX_ADMA_IIPCR(chan) IOP13XX_ADMA_OFFSET(chan, 0x18) -#define IOP13XX_ADMA_IIPAR(chan) IOP13XX_ADMA_OFFSET(chan, 0x1c) -#define IOP13XX_ADMA_IIPUAR(chan) IOP13XX_ADMA_OFFSET(chan, 0x20) -#define IOP13XX_ADMA_ANDAR(chan) 
IOP13XX_ADMA_OFFSET(chan, 0x24) -#define IOP13XX_ADMA_ADCR(chan) IOP13XX_ADMA_OFFSET(chan, 0x28) -#define IOP13XX_ADMA_CARMD(chan) IOP13XX_ADMA_OFFSET(chan, 0x2c) -#define IOP13XX_ADMA_ABCR(chan) IOP13XX_ADMA_OFFSET(chan, 0x30) -#define IOP13XX_ADMA_DLADR(chan) IOP13XX_ADMA_OFFSET(chan, 0x34) -#define IOP13XX_ADMA_DUADR(chan) IOP13XX_ADMA_OFFSET(chan, 0x38) -#define IOP13XX_ADMA_SLAR(src, chan) IOP13XX_ADMA_OFFSET(chan, 0x3c + (src <<3)) -#define IOP13XX_ADMA_SUAR(src, chan) IOP13XX_ADMA_OFFSET(chan, 0x40 + (src <<3)) /*==============================XSI BRIDGE===============================*/ #define IOP13XX_XBG_BECSR IOP13XX_REG_ADDR32(0x178c) diff --git a/include/asm-arm/arch-iop32x/adma.h b/include/asm-arm/arch-iop32x/adma.h new file mode 100644 index 000000000000..5ed92037dd10 --- /dev/null +++ b/include/asm-arm/arch-iop32x/adma.h @@ -0,0 +1,5 @@ +#ifndef IOP32X_ADMA_H +#define IOP32X_ADMA_H +#include +#endif + diff --git a/include/asm-arm/arch-iop33x/adma.h b/include/asm-arm/arch-iop33x/adma.h new file mode 100644 index 000000000000..4b92f795f90e --- /dev/null +++ b/include/asm-arm/arch-iop33x/adma.h @@ -0,0 +1,5 @@ +#ifndef IOP33X_ADMA_H +#define IOP33X_ADMA_H +#include +#endif + diff --git a/include/asm-arm/hardware/iop3xx-adma.h b/include/asm-arm/hardware/iop3xx-adma.h new file mode 100644 index 000000000000..10834b54f681 --- /dev/null +++ b/include/asm-arm/hardware/iop3xx-adma.h @@ -0,0 +1,892 @@ +/* + * Copyright © 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
/*
 * Register addresses for the iop3xx channels, expressed as offsets from
 * the channel's register base (the mmr_base member of @chan).
 *
 * Every macro parameter is parenthesised so that expression arguments
 * expand with the intended precedence.
 */

/* Memory copy units */
#define DMA_CCR(chan)		((chan)->mmr_base + 0x0)
#define DMA_CSR(chan)		((chan)->mmr_base + 0x4)
#define DMA_DAR(chan)		((chan)->mmr_base + 0xc)
#define DMA_NDAR(chan)		((chan)->mmr_base + 0x10)
#define DMA_PADR(chan)		((chan)->mmr_base + 0x14)
#define DMA_PUADR(chan)		((chan)->mmr_base + 0x18)
#define DMA_LADR(chan)		((chan)->mmr_base + 0x1c)
#define DMA_BCR(chan)		((chan)->mmr_base + 0x20)
#define DMA_DCR(chan)		((chan)->mmr_base + 0x24)

/* Application accelerator unit */
#define AAU_ACR(chan)		((chan)->mmr_base + 0x0)
#define AAU_ASR(chan)		((chan)->mmr_base + 0x4)
#define AAU_ADAR(chan)		((chan)->mmr_base + 0x8)
#define AAU_ANDAR(chan)		((chan)->mmr_base + 0xc)
#define AAU_SAR(src, chan)	((chan)->mmr_base + (0x10 + ((src) << 2)))
#define AAU_DAR(chan)		((chan)->mmr_base + 0x20)
#define AAU_ABCR(chan)		((chan)->mmr_base + 0x24)
#define AAU_ADCR(chan)		((chan)->mmr_base + 0x28)
/*
 * NOTE: unlike the macros above, AAU_SAR_EDCR() takes no channel
 * argument; it uses whatever variable named "chan" is in scope at the
 * point of use.  The one-argument form is kept so existing users are
 * unaffected.
 */
#define AAU_SAR_EDCR(src_edc)	(chan->mmr_base + (0x02c + (((src_edc) - 4) << 2)))
#define AAU_EDCR0_IDX	8
#define AAU_EDCR1_IDX	17
#define AAU_EDCR2_IDX	26

/* Channel identifiers compared against chan->device->id */
#define DMA0_ID 0
#define DMA1_ID 1
#define AAU_ID 2
blk1_cmd_ctrl:3; + unsigned int blk2_cmd_ctrl:3; + unsigned int blk3_cmd_ctrl:3; + unsigned int blk4_cmd_ctrl:3; + unsigned int blk5_cmd_ctrl:3; + unsigned int blk6_cmd_ctrl:3; + unsigned int blk7_cmd_ctrl:3; + unsigned int blk8_cmd_ctrl:3; + unsigned int reserved2:7; +}; + +struct iop3xx_dma_desc_ctrl { + unsigned int pci_transaction:4; + unsigned int int_en:1; + unsigned int dac_cycle_en:1; + unsigned int mem_to_mem_en:1; + unsigned int crc_data_tx_en:1; + unsigned int crc_gen_en:1; + unsigned int crc_seed_dis:1; + unsigned int reserved:21; + unsigned int crc_tx_complete:1; +}; + +struct iop3xx_desc_dma { + u32 next_desc; + union { + u32 pci_src_addr; + u32 pci_dest_addr; + u32 src_addr; + }; + union { + u32 upper_pci_src_addr; + u32 upper_pci_dest_addr; + }; + union { + u32 local_pci_src_addr; + u32 local_pci_dest_addr; + u32 dest_addr; + }; + u32 byte_count; + union { + u32 desc_ctrl; + struct iop3xx_dma_desc_ctrl desc_ctrl_field; + }; + u32 crc_addr; +}; + +struct iop3xx_desc_aau { + u32 next_desc; + u32 src[4]; + u32 dest_addr; + u32 byte_count; + union { + u32 desc_ctrl; + struct iop3xx_aau_desc_ctrl desc_ctrl_field; + }; + union { + u32 src_addr; + u32 e_desc_ctrl; + struct iop3xx_aau_e_desc_ctrl e_desc_ctrl_field; + } src_edc[31]; +}; + +struct iop3xx_aau_gfmr { + unsigned int gfmr1:8; + unsigned int gfmr2:8; + unsigned int gfmr3:8; + unsigned int gfmr4:8; +}; + +struct iop3xx_desc_pq_xor { + u32 next_desc; + u32 src[3]; + union { + u32 data_mult1; + struct iop3xx_aau_gfmr data_mult1_field; + }; + u32 dest_addr; + u32 byte_count; + union { + u32 desc_ctrl; + struct iop3xx_aau_desc_ctrl desc_ctrl_field; + }; + union { + u32 src_addr; + u32 e_desc_ctrl; + struct iop3xx_aau_e_desc_ctrl e_desc_ctrl_field; + u32 data_multiplier; + struct iop3xx_aau_gfmr data_mult_field; + u32 reserved; + } src_edc_gfmr[19]; +}; + +struct iop3xx_desc_dual_xor { + u32 next_desc; + u32 src0_addr; + u32 src1_addr; + u32 h_src_addr; + u32 d_src_addr; + u32 h_dest_addr; + u32 
byte_count; + union { + u32 desc_ctrl; + struct iop3xx_aau_desc_ctrl desc_ctrl_field; + }; + u32 d_dest_addr; +}; + +union iop3xx_desc { + struct iop3xx_desc_aau *aau; + struct iop3xx_desc_dma *dma; + struct iop3xx_desc_pq_xor *pq_xor; + struct iop3xx_desc_dual_xor *dual_xor; + void *ptr; +}; + +static inline int iop_adma_get_max_xor(void) +{ + return 32; +} + +static inline u32 iop_chan_get_current_descriptor(struct iop_adma_chan *chan) +{ + int id = chan->device->id; + + switch (id) { + case DMA0_ID: + case DMA1_ID: + return __raw_readl(DMA_DAR(chan)); + case AAU_ID: + return __raw_readl(AAU_ADAR(chan)); + default: + BUG(); + } + return 0; +} + +static inline void iop_chan_set_next_descriptor(struct iop_adma_chan *chan, + u32 next_desc_addr) +{ + int id = chan->device->id; + + switch (id) { + case DMA0_ID: + case DMA1_ID: + __raw_writel(next_desc_addr, DMA_NDAR(chan)); + break; + case AAU_ID: + __raw_writel(next_desc_addr, AAU_ANDAR(chan)); + break; + } + +} + +#define IOP_ADMA_STATUS_BUSY (1 << 10) +#define IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT (1024) +#define IOP_ADMA_XOR_MAX_BYTE_COUNT (16 * 1024 * 1024) +#define IOP_ADMA_MAX_BYTE_COUNT (16 * 1024 * 1024) + +static inline int iop_chan_is_busy(struct iop_adma_chan *chan) +{ + u32 status = __raw_readl(DMA_CSR(chan)); + return (status & IOP_ADMA_STATUS_BUSY) ? 1 : 0; +} + +static inline int iop_desc_is_aligned(struct iop_adma_desc_slot *desc, + int num_slots) +{ + /* num_slots will only ever be 1, 2, 4, or 8 */ + return (desc->idx & (num_slots - 1)) ? 0 : 1; +} + +/* to do: support large (i.e. > hw max) buffer sizes */ +static inline int iop_chan_memcpy_slot_count(size_t len, int *slots_per_op) +{ + *slots_per_op = 1; + return 1; +} + +/* to do: support large (i.e. 
/**
 * iop3xx_aau_xor_slot_count - slots occupied by one AAU XOR descriptor
 * @len: transfer length in bytes; not used by the lookup
 * @src_cnt: number of source buffers, valid range 0..32
 * @slots_per_op: out parameter, receives the same value that is returned
 *
 * The slot count grows in powers of two with the number of sources:
 * 1 slot for up to 4 sources, 2 for up to 8, 4 for up to 16 and 8 for up
 * to 32.  A @src_cnt outside the table is reported as 0 slots (the same
 * value the table holds for zero sources) instead of reading past the
 * end of the table.
 *
 * Returns the number of slots.
 */
static inline int iop3xx_aau_xor_slot_count(size_t len, int src_cnt,
					int *slots_per_op)
{
	static const int slot_count_table[] = { 0,
						1, 1, 1, 1, /* 01 - 04 */
						2, 2, 2, 2, /* 05 - 08 */
						4, 4, 4, 4, /* 09 - 12 */
						4, 4, 4, 4, /* 13 - 16 */
						8, 8, 8, 8, /* 17 - 20 */
						8, 8, 8, 8, /* 21 - 24 */
						8, 8, 8, 8, /* 25 - 28 */
						8, 8, 8, 8, /* 29 - 32 */
					      };
	const int max_src_cnt =
		(int)(sizeof(slot_count_table) / sizeof(slot_count_table[0])) - 1;

	if (src_cnt < 0 || src_cnt > max_src_cnt)
		*slots_per_op = 0;
	else
		*slots_per_op = slot_count_table[src_cnt];

	return *slots_per_op;
}
/**
 * __desc_idx - translate a source index to a descriptor word index
 * @src_idx: source index; the table has 32 entries, so the valid range
 *	     is 0..31 and the caller must stay inside it (no range check
 *	     is performed here)
 *
 * The result is used by the source-address helpers in this header to
 * index the src_edc[] array, and only for @src_idx >= 4; entries 0..3
 * are placeholders that map to 0.
 * NOTE(review): the table never yields 4, 13 or 22 - presumably those
 * words are reserved for something other than a source address; confirm
 * against the hardware descriptor layout.
 */
static inline int __desc_idx(int src_idx)
{
	static const int desc_idx_table[] = { 0, 0, 0, 0,
					      0, 1, 2, 3,
					      5, 6, 7, 8,
					      9, 10, 11, 12,
					      14, 15, 16, 17,
					      18, 19, 20, 21,
					      23, 24, 25, 26,
					      27, 28, 29, 30,
					    };

	return desc_idx_table[src_idx];
}
read block */ + u_desc_ctrl.field.int_en = int_en; + hw_desc->desc_ctrl = u_desc_ctrl.value; + hw_desc->upper_pci_src_addr = 0; + hw_desc->crc_addr = 0; +} + +static inline void +iop_desc_init_memset(struct iop_adma_desc_slot *desc, int int_en) +{ + struct iop3xx_desc_aau *hw_desc = desc->hw_desc; + union { + u32 value; + struct iop3xx_aau_desc_ctrl field; + } u_desc_ctrl; + + u_desc_ctrl.value = 0; + u_desc_ctrl.field.blk1_cmd_ctrl = 0x2; /* memory block fill */ + u_desc_ctrl.field.dest_write_en = 1; + u_desc_ctrl.field.int_en = int_en; + hw_desc->desc_ctrl = u_desc_ctrl.value; +} + +static inline u32 +iop3xx_desc_init_xor(struct iop3xx_desc_aau *hw_desc, int src_cnt, int int_en) +{ + int i, shift; + u32 edcr; + union { + u32 value; + struct iop3xx_aau_desc_ctrl field; + } u_desc_ctrl; + + u_desc_ctrl.value = 0; + switch (src_cnt) { + case 25 ... 32: + u_desc_ctrl.field.blk_ctrl = 0x3; /* use EDCR[2:0] */ + edcr = 0; + shift = 1; + for (i = 24; i < src_cnt; i++) { + edcr |= (1 << shift); + shift += 3; + } + hw_desc->src_edc[AAU_EDCR2_IDX].e_desc_ctrl = edcr; + src_cnt = 24; + /* fall through */ + case 17 ... 24: + if (!u_desc_ctrl.field.blk_ctrl) { + hw_desc->src_edc[AAU_EDCR2_IDX].e_desc_ctrl = 0; + u_desc_ctrl.field.blk_ctrl = 0x3; /* use EDCR[2:0] */ + } + edcr = 0; + shift = 1; + for (i = 16; i < src_cnt; i++) { + edcr |= (1 << shift); + shift += 3; + } + hw_desc->src_edc[AAU_EDCR1_IDX].e_desc_ctrl = edcr; + src_cnt = 16; + /* fall through */ + case 9 ... 16: + if (!u_desc_ctrl.field.blk_ctrl) + u_desc_ctrl.field.blk_ctrl = 0x2; /* use EDCR0 */ + edcr = 0; + shift = 1; + for (i = 8; i < src_cnt; i++) { + edcr |= (1 << shift); + shift += 3; + } + hw_desc->src_edc[AAU_EDCR0_IDX].e_desc_ctrl = edcr; + src_cnt = 8; + /* fall through */ + case 2 ... 
8: + shift = 1; + for (i = 0; i < src_cnt; i++) { + u_desc_ctrl.value |= (1 << shift); + shift += 3; + } + + if (!u_desc_ctrl.field.blk_ctrl && src_cnt > 4) + u_desc_ctrl.field.blk_ctrl = 0x1; /* use mini-desc */ + } + + u_desc_ctrl.field.dest_write_en = 1; + u_desc_ctrl.field.blk1_cmd_ctrl = 0x7; /* direct fill */ + u_desc_ctrl.field.int_en = int_en; + hw_desc->desc_ctrl = u_desc_ctrl.value; + + return u_desc_ctrl.value; +} + +static inline void +iop_desc_init_xor(struct iop_adma_desc_slot *desc, int src_cnt, int int_en) +{ + iop3xx_desc_init_xor(desc->hw_desc, src_cnt, int_en); +} + +/* return the number of operations */ +static inline int +iop_desc_init_zero_sum(struct iop_adma_desc_slot *desc, int src_cnt, int int_en) +{ + int slot_cnt = desc->slot_cnt, slots_per_op = desc->slots_per_op; + struct iop3xx_desc_aau *hw_desc, *prev_hw_desc, *iter; + union { + u32 value; + struct iop3xx_aau_desc_ctrl field; + } u_desc_ctrl; + int i, j; + + hw_desc = desc->hw_desc; + + for (i = 0, j = 0; (slot_cnt -= slots_per_op) >= 0; + i += slots_per_op, j++) { + iter = iop_hw_desc_slot_idx(hw_desc, i); + u_desc_ctrl.value = iop3xx_desc_init_xor(iter, src_cnt, int_en); + u_desc_ctrl.field.dest_write_en = 0; + u_desc_ctrl.field.zero_result_en = 1; + u_desc_ctrl.field.int_en = int_en; + iter->desc_ctrl = u_desc_ctrl.value; + + /* for the subsequent descriptors preserve the store queue + * and chain them together + */ + if (i) { + prev_hw_desc = + iop_hw_desc_slot_idx(hw_desc, i - slots_per_op); + prev_hw_desc->next_desc = + (u32) (desc->async_tx.phys + (i << 5)); + } + } + + return j; +} + +static inline void +iop_desc_init_null_xor(struct iop_adma_desc_slot *desc, int src_cnt, int int_en) +{ + struct iop3xx_desc_aau *hw_desc = desc->hw_desc; + union { + u32 value; + struct iop3xx_aau_desc_ctrl field; + } u_desc_ctrl; + + u_desc_ctrl.value = 0; + switch (src_cnt) { + case 25 ... 
32: + u_desc_ctrl.field.blk_ctrl = 0x3; /* use EDCR[2:0] */ + hw_desc->src_edc[AAU_EDCR2_IDX].e_desc_ctrl = 0; + /* fall through */ + case 17 ... 24: + if (!u_desc_ctrl.field.blk_ctrl) { + hw_desc->src_edc[AAU_EDCR2_IDX].e_desc_ctrl = 0; + u_desc_ctrl.field.blk_ctrl = 0x3; /* use EDCR[2:0] */ + } + hw_desc->src_edc[AAU_EDCR1_IDX].e_desc_ctrl = 0; + /* fall through */ + case 9 ... 16: + if (!u_desc_ctrl.field.blk_ctrl) + u_desc_ctrl.field.blk_ctrl = 0x2; /* use EDCR0 */ + hw_desc->src_edc[AAU_EDCR0_IDX].e_desc_ctrl = 0; + /* fall through */ + case 1 ... 8: + if (!u_desc_ctrl.field.blk_ctrl && src_cnt > 4) + u_desc_ctrl.field.blk_ctrl = 0x1; /* use mini-desc */ + } + + u_desc_ctrl.field.dest_write_en = 0; + u_desc_ctrl.field.int_en = int_en; + hw_desc->desc_ctrl = u_desc_ctrl.value; +} + +static inline void iop_desc_set_byte_count(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *chan, + u32 byte_count) +{ + union iop3xx_desc hw_desc = { .ptr = desc->hw_desc, }; + + switch (chan->device->id) { + case DMA0_ID: + case DMA1_ID: + hw_desc.dma->byte_count = byte_count; + break; + case AAU_ID: + hw_desc.aau->byte_count = byte_count; + break; + default: + BUG(); + } +} + +static inline void +iop_desc_init_interrupt(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *chan) +{ + union iop3xx_desc hw_desc = { .ptr = desc->hw_desc, }; + + switch (chan->device->id) { + case DMA0_ID: + case DMA1_ID: + iop_desc_init_memcpy(desc, 1); + hw_desc.dma->byte_count = 0; + hw_desc.dma->dest_addr = 0; + hw_desc.dma->src_addr = 0; + break; + case AAU_ID: + iop_desc_init_null_xor(desc, 2, 1); + hw_desc.aau->byte_count = 0; + hw_desc.aau->dest_addr = 0; + hw_desc.aau->src[0] = 0; + hw_desc.aau->src[1] = 0; + break; + default: + BUG(); + } +} + +static inline void +iop_desc_set_zero_sum_byte_count(struct iop_adma_desc_slot *desc, u32 len) +{ + int slots_per_op = desc->slots_per_op; + struct iop3xx_desc_aau *hw_desc = desc->hw_desc, *iter; + int i = 0; + + if (len <= 
IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT) { + hw_desc->byte_count = len; + } else { + do { + iter = iop_hw_desc_slot_idx(hw_desc, i); + iter->byte_count = IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT; + len -= IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT; + i += slots_per_op; + } while (len > IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT); + + if (len) { + iter = iop_hw_desc_slot_idx(hw_desc, i); + iter->byte_count = len; + } + } +} + +static inline void iop_desc_set_dest_addr(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *chan, + dma_addr_t addr) +{ + union iop3xx_desc hw_desc = { .ptr = desc->hw_desc, }; + + switch (chan->device->id) { + case DMA0_ID: + case DMA1_ID: + hw_desc.dma->dest_addr = addr; + break; + case AAU_ID: + hw_desc.aau->dest_addr = addr; + break; + default: + BUG(); + } +} + +static inline void iop_desc_set_memcpy_src_addr(struct iop_adma_desc_slot *desc, + dma_addr_t addr) +{ + struct iop3xx_desc_dma *hw_desc = desc->hw_desc; + hw_desc->src_addr = addr; +} + +static inline void +iop_desc_set_zero_sum_src_addr(struct iop_adma_desc_slot *desc, int src_idx, + dma_addr_t addr) +{ + + struct iop3xx_desc_aau *hw_desc = desc->hw_desc, *iter; + int slot_cnt = desc->slot_cnt, slots_per_op = desc->slots_per_op; + int i; + + for (i = 0; (slot_cnt -= slots_per_op) >= 0; + i += slots_per_op, addr += IOP_ADMA_ZERO_SUM_MAX_BYTE_COUNT) { + iter = iop_hw_desc_slot_idx(hw_desc, i); + iop3xx_aau_desc_set_src_addr(iter, src_idx, addr); + } +} + +static inline void iop_desc_set_xor_src_addr(struct iop_adma_desc_slot *desc, + int src_idx, dma_addr_t addr) +{ + + struct iop3xx_desc_aau *hw_desc = desc->hw_desc, *iter; + int slot_cnt = desc->slot_cnt, slots_per_op = desc->slots_per_op; + int i; + + for (i = 0; (slot_cnt -= slots_per_op) >= 0; + i += slots_per_op, addr += IOP_ADMA_XOR_MAX_BYTE_COUNT) { + iter = iop_hw_desc_slot_idx(hw_desc, i); + iop3xx_aau_desc_set_src_addr(iter, src_idx, addr); + } +} + +static inline void iop_desc_set_next_desc(struct iop_adma_desc_slot *desc, + u32 next_desc_addr) 
+{ + /* hw_desc->next_desc is the same location for all channels */ + union iop3xx_desc hw_desc = { .ptr = desc->hw_desc, }; + BUG_ON(hw_desc.dma->next_desc); + hw_desc.dma->next_desc = next_desc_addr; +} + +static inline u32 iop_desc_get_next_desc(struct iop_adma_desc_slot *desc) +{ + /* hw_desc->next_desc is the same location for all channels */ + union iop3xx_desc hw_desc = { .ptr = desc->hw_desc, }; + return hw_desc.dma->next_desc; +} + +static inline void iop_desc_clear_next_desc(struct iop_adma_desc_slot *desc) +{ + /* hw_desc->next_desc is the same location for all channels */ + union iop3xx_desc hw_desc = { .ptr = desc->hw_desc, }; + hw_desc.dma->next_desc = 0; +} + +static inline void iop_desc_set_block_fill_val(struct iop_adma_desc_slot *desc, + u32 val) +{ + struct iop3xx_desc_aau *hw_desc = desc->hw_desc; + hw_desc->src[0] = val; +} + +static inline int iop_desc_get_zero_result(struct iop_adma_desc_slot *desc) +{ + struct iop3xx_desc_aau *hw_desc = desc->hw_desc; + struct iop3xx_aau_desc_ctrl desc_ctrl = hw_desc->desc_ctrl_field; + + BUG_ON(!(desc_ctrl.tx_complete && desc_ctrl.zero_result_en)); + return desc_ctrl.zero_result_err; +} + +static inline void iop_chan_append(struct iop_adma_chan *chan) +{ + u32 dma_chan_ctrl; + /* workaround dropped interrupts on 3xx */ + mod_timer(&chan->cleanup_watchdog, jiffies + msecs_to_jiffies(3)); + + dma_chan_ctrl = __raw_readl(DMA_CCR(chan)); + dma_chan_ctrl |= 0x2; + __raw_writel(dma_chan_ctrl, DMA_CCR(chan)); +} + +static inline void iop_chan_idle(int busy, struct iop_adma_chan *chan) +{ + if (!busy) + del_timer(&chan->cleanup_watchdog); +} + +static inline u32 iop_chan_get_status(struct iop_adma_chan *chan) +{ + return __raw_readl(DMA_CSR(chan)); +} + +static inline void iop_chan_disable(struct iop_adma_chan *chan) +{ + u32 dma_chan_ctrl = __raw_readl(DMA_CCR(chan)); + dma_chan_ctrl &= ~1; + __raw_writel(dma_chan_ctrl, DMA_CCR(chan)); +} + +static inline void iop_chan_enable(struct iop_adma_chan *chan) +{ + u32 
dma_chan_ctrl = __raw_readl(DMA_CCR(chan)); + + dma_chan_ctrl |= 1; + __raw_writel(dma_chan_ctrl, DMA_CCR(chan)); +} + +static inline void iop_adma_device_clear_eot_status(struct iop_adma_chan *chan) +{ + u32 status = __raw_readl(DMA_CSR(chan)); + status &= (1 << 9); + __raw_writel(status, DMA_CSR(chan)); +} + +static inline void iop_adma_device_clear_eoc_status(struct iop_adma_chan *chan) +{ + u32 status = __raw_readl(DMA_CSR(chan)); + status &= (1 << 8); + __raw_writel(status, DMA_CSR(chan)); +} + +static inline void iop_adma_device_clear_err_status(struct iop_adma_chan *chan) +{ + u32 status = __raw_readl(DMA_CSR(chan)); + + switch (chan->device->id) { + case DMA0_ID: + case DMA1_ID: + status &= (1 << 5) | (1 << 3) | (1 << 2) | (1 << 1); + break; + case AAU_ID: + status &= (1 << 5); + break; + default: + BUG(); + } + + __raw_writel(status, DMA_CSR(chan)); +} + +static inline int +iop_is_err_int_parity(unsigned long status, struct iop_adma_chan *chan) +{ + return 0; +} + +static inline int +iop_is_err_mcu_abort(unsigned long status, struct iop_adma_chan *chan) +{ + return 0; +} + +static inline int +iop_is_err_int_tabort(unsigned long status, struct iop_adma_chan *chan) +{ + return 0; +} + +static inline int +iop_is_err_int_mabort(unsigned long status, struct iop_adma_chan *chan) +{ + return test_bit(5, &status); +} + +static inline int +iop_is_err_pci_tabort(unsigned long status, struct iop_adma_chan *chan) +{ + switch (chan->device->id) { + case DMA0_ID: + case DMA1_ID: + return test_bit(2, &status); + default: + return 0; + } +} + +static inline int +iop_is_err_pci_mabort(unsigned long status, struct iop_adma_chan *chan) +{ + switch (chan->device->id) { + case DMA0_ID: + case DMA1_ID: + return test_bit(3, &status); + default: + return 0; + } +} + +static inline int +iop_is_err_split_tx(unsigned long status, struct iop_adma_chan *chan) +{ + switch (chan->device->id) { + case DMA0_ID: + case DMA1_ID: + return test_bit(1, &status); + default: + return 0; + } +} 
+#endif /* _ADMA_H */
diff --git a/include/asm-arm/hardware/iop3xx.h b/include/asm-arm/hardware/iop3xx.h
index 63feceb7ede5..81ca5d3e2bff 100644
--- a/include/asm-arm/hardware/iop3xx.h
+++ b/include/asm-arm/hardware/iop3xx.h
@@ -144,24 +144,9 @@ extern int init_atu;
 #define IOP3XX_IAR (volatile u32 *)IOP3XX_REG_ADDR(0x0380)
 
 /* DMA Controller */
-#define IOP3XX_DMA0_CCR (volatile u32 *)IOP3XX_REG_ADDR(0x0400)
-#define IOP3XX_DMA0_CSR (volatile u32 *)IOP3XX_REG_ADDR(0x0404)
-#define IOP3XX_DMA0_DAR (volatile u32 *)IOP3XX_REG_ADDR(0x040c)
-#define IOP3XX_DMA0_NDAR (volatile u32 *)IOP3XX_REG_ADDR(0x0410)
-#define IOP3XX_DMA0_PADR (volatile u32 *)IOP3XX_REG_ADDR(0x0414)
-#define IOP3XX_DMA0_PUADR (volatile u32 *)IOP3XX_REG_ADDR(0x0418)
-#define IOP3XX_DMA0_LADR (volatile u32 *)IOP3XX_REG_ADDR(0x041c)
-#define IOP3XX_DMA0_BCR (volatile u32 *)IOP3XX_REG_ADDR(0x0420)
-#define IOP3XX_DMA0_DCR (volatile u32 *)IOP3XX_REG_ADDR(0x0424)
-#define IOP3XX_DMA1_CCR (volatile u32 *)IOP3XX_REG_ADDR(0x0440)
-#define IOP3XX_DMA1_CSR (volatile u32 *)IOP3XX_REG_ADDR(0x0444)
-#define IOP3XX_DMA1_DAR (volatile u32 *)IOP3XX_REG_ADDR(0x044c)
-#define IOP3XX_DMA1_NDAR (volatile u32 *)IOP3XX_REG_ADDR(0x0450)
-#define IOP3XX_DMA1_PADR (volatile u32 *)IOP3XX_REG_ADDR(0x0454)
-#define IOP3XX_DMA1_PUADR (volatile u32 *)IOP3XX_REG_ADDR(0x0458)
-#define IOP3XX_DMA1_LADR (volatile u32 *)IOP3XX_REG_ADDR(0x045c)
-#define IOP3XX_DMA1_BCR (volatile u32 *)IOP3XX_REG_ADDR(0x0460)
-#define IOP3XX_DMA1_DCR (volatile u32 *)IOP3XX_REG_ADDR(0x0464)
+#define IOP3XX_DMA_PHYS_BASE(chan) (IOP3XX_PERIPHERAL_PHYS_BASE + \
+	(0x400 + ((chan) << 6))) /* channel register blocks are 0x40 apart */
+#define IOP3XX_DMA_UPPER_PA(chan) (IOP3XX_DMA_PHYS_BASE(chan) + 0x27)
 
 /* Peripheral bus interface */
 #define IOP3XX_PBCR (volatile u32 *)IOP3XX_REG_ADDR(0x0680)
@@ -210,48 +195,8 @@ extern int init_atu;
 #define IOP_TMR_RATIO_1_1 0x00
 
 /* Application accelerator unit */
-#define IOP3XX_AAU_ACR (volatile u32 *)IOP3XX_REG_ADDR(0x0800)
-#define IOP3XX_AAU_ASR (volatile u32 *)IOP3XX_REG_ADDR(0x0804)
-#define IOP3XX_AAU_ADAR (volatile u32 *)IOP3XX_REG_ADDR(0x0808)
-#define IOP3XX_AAU_ANDAR (volatile u32 *)IOP3XX_REG_ADDR(0x080c)
-#define IOP3XX_AAU_SAR1 (volatile u32 *)IOP3XX_REG_ADDR(0x0810)
-#define IOP3XX_AAU_SAR2 (volatile u32 *)IOP3XX_REG_ADDR(0x0814)
-#define IOP3XX_AAU_SAR3 (volatile u32 *)IOP3XX_REG_ADDR(0x0818)
-#define IOP3XX_AAU_SAR4 (volatile u32 *)IOP3XX_REG_ADDR(0x081c)
-#define IOP3XX_AAU_DAR (volatile u32 *)IOP3XX_REG_ADDR(0x0820)
-#define IOP3XX_AAU_ABCR (volatile u32 *)IOP3XX_REG_ADDR(0x0824)
-#define IOP3XX_AAU_ADCR (volatile u32 *)IOP3XX_REG_ADDR(0x0828)
-#define IOP3XX_AAU_SAR5 (volatile u32 *)IOP3XX_REG_ADDR(0x082c)
-#define IOP3XX_AAU_SAR6 (volatile u32 *)IOP3XX_REG_ADDR(0x0830)
-#define IOP3XX_AAU_SAR7 (volatile u32 *)IOP3XX_REG_ADDR(0x0834)
-#define IOP3XX_AAU_SAR8 (volatile u32 *)IOP3XX_REG_ADDR(0x0838)
-#define IOP3XX_AAU_EDCR0 (volatile u32 *)IOP3XX_REG_ADDR(0x083c)
-#define IOP3XX_AAU_SAR9 (volatile u32 *)IOP3XX_REG_ADDR(0x0840)
-#define IOP3XX_AAU_SAR10 (volatile u32 *)IOP3XX_REG_ADDR(0x0844)
-#define IOP3XX_AAU_SAR11 (volatile u32 *)IOP3XX_REG_ADDR(0x0848)
-#define IOP3XX_AAU_SAR12 (volatile u32 *)IOP3XX_REG_ADDR(0x084c)
-#define IOP3XX_AAU_SAR13 (volatile u32 *)IOP3XX_REG_ADDR(0x0850)
-#define IOP3XX_AAU_SAR14 (volatile u32 *)IOP3XX_REG_ADDR(0x0854)
-#define IOP3XX_AAU_SAR15 (volatile u32 *)IOP3XX_REG_ADDR(0x0858)
-#define IOP3XX_AAU_SAR16 (volatile u32 *)IOP3XX_REG_ADDR(0x085c)
-#define IOP3XX_AAU_EDCR1 (volatile u32 *)IOP3XX_REG_ADDR(0x0860)
-#define IOP3XX_AAU_SAR17 (volatile u32 *)IOP3XX_REG_ADDR(0x0864)
-#define IOP3XX_AAU_SAR18 (volatile u32 *)IOP3XX_REG_ADDR(0x0868)
-#define IOP3XX_AAU_SAR19 (volatile u32 *)IOP3XX_REG_ADDR(0x086c)
-#define IOP3XX_AAU_SAR20 (volatile u32 *)IOP3XX_REG_ADDR(0x0870)
-#define IOP3XX_AAU_SAR21 (volatile u32 *)IOP3XX_REG_ADDR(0x0874)
-#define IOP3XX_AAU_SAR22 (volatile u32 *)IOP3XX_REG_ADDR(0x0878)
-#define IOP3XX_AAU_SAR23 (volatile u32 *)IOP3XX_REG_ADDR(0x087c)
-#define IOP3XX_AAU_SAR24 (volatile u32 *)IOP3XX_REG_ADDR(0x0880) -#define IOP3XX_AAU_EDCR2 (volatile u32 *)IOP3XX_REG_ADDR(0x0884) -#define IOP3XX_AAU_SAR25 (volatile u32 *)IOP3XX_REG_ADDR(0x0888) -#define IOP3XX_AAU_SAR26 (volatile u32 *)IOP3XX_REG_ADDR(0x088c) -#define IOP3XX_AAU_SAR27 (volatile u32 *)IOP3XX_REG_ADDR(0x0890) -#define IOP3XX_AAU_SAR28 (volatile u32 *)IOP3XX_REG_ADDR(0x0894) -#define IOP3XX_AAU_SAR29 (volatile u32 *)IOP3XX_REG_ADDR(0x0898) -#define IOP3XX_AAU_SAR30 (volatile u32 *)IOP3XX_REG_ADDR(0x089c) -#define IOP3XX_AAU_SAR31 (volatile u32 *)IOP3XX_REG_ADDR(0x08a0) -#define IOP3XX_AAU_SAR32 (volatile u32 *)IOP3XX_REG_ADDR(0x08a4) +#define IOP3XX_AAU_PHYS_BASE (IOP3XX_PERIPHERAL_PHYS_BASE + 0x800) +#define IOP3XX_AAU_UPPER_PA (IOP3XX_AAU_PHYS_BASE + 0xa7) /* I2C bus interface unit */ #define IOP3XX_ICR0 (volatile u32 *)IOP3XX_REG_ADDR(0x1680) @@ -329,6 +274,9 @@ static inline void write_tisr(u32 val) asm volatile("mcr p6, 0, %0, c6, c1, 0" : : "r" (val)); } +extern struct platform_device iop3xx_dma_0_channel; +extern struct platform_device iop3xx_dma_1_channel; +extern struct platform_device iop3xx_aau_channel; extern struct platform_device iop3xx_i2c0_device; extern struct platform_device iop3xx_i2c1_device; diff --git a/include/asm-arm/hardware/iop_adma.h b/include/asm-arm/hardware/iop_adma.h new file mode 100644 index 000000000000..ca8e71f44346 --- /dev/null +++ b/include/asm-arm/hardware/iop_adma.h @@ -0,0 +1,118 @@ +/* + * Copyright © 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef IOP_ADMA_H
+#define IOP_ADMA_H
+#include
+#include
+#include
+
+#define IOP_ADMA_SLOT_SIZE 32
+#define IOP_ADMA_THRESHOLD 4
+
+/**
+ * struct iop_adma_device - internal representation of an ADMA device
+ * @pdev: Platform device
+ * @id: HW ADMA Device selector
+ * @dma_desc_pool: base of DMA descriptor region (DMA address)
+ * @dma_desc_pool_virt: base of DMA descriptor region (CPU address)
+ * @common: embedded struct dma_device
+ */
+struct iop_adma_device {
+	struct platform_device *pdev;
+	int id;
+	dma_addr_t dma_desc_pool;
+	void *dma_desc_pool_virt;
+	struct dma_device common;
+};
+
+/**
+ * struct iop_adma_chan - internal representation of an ADMA channel
+ * @pending: allows batching of hardware operations
+ * @completed_cookie: identifier for the most recently completed operation
+ * @lock: serializes enqueue/dequeue operations to the slot pool
+ * @mmr_base: memory mapped register base
+ * @chain: device chain view of the descriptors
+ * @device: parent device
+ * @common: common dmaengine channel object members
+ * @last_used: place holder for allocation to continue from where it left off
+ * @all_slots: complete domain of slots usable by the channel
+ * @cleanup_watchdog: workaround missed interrupts on iop3xx
+ * @slots_allocated: records the actual size of the descriptor slot pool
+ * @irq_tasklet: bottom half where iop_adma_slot_cleanup runs
+ */
+struct iop_adma_chan {
+	int pending;
+	dma_cookie_t completed_cookie;
+	spinlock_t lock; /* protects the descriptor slot pool */
+	void __iomem *mmr_base;
+	struct list_head chain;
+	struct iop_adma_device *device;
+	struct dma_chan common;
+	struct iop_adma_desc_slot *last_used;
+	struct list_head all_slots;
+	struct timer_list cleanup_watchdog;
+	int slots_allocated;
+	struct tasklet_struct irq_tasklet;
+};
+
+/**
+ * struct iop_adma_desc_slot - IOP-ADMA software descriptor
+ * @slot_node: node on the iop_adma_chan.all_slots list
+ * @chain_node: node on the iop_adma_chan.chain list
+ * @hw_desc: virtual address of the hardware descriptor chain
+ * @phys: hardware address of the descriptor chain (kept in @async_tx.phys)
+ * @group_head: first operation in a transaction
+ * @slot_cnt: total slots used in a transaction (group of operations)
+ * @slots_per_op: number of slots per operation
+ * @idx: pool index
+ * @unmap_src_cnt: number of xor sources
+ * @unmap_len: transaction bytecount
+ * @async_tx: support for the async_tx api
+ * @group_list: list of slots that make up a multi-descriptor transaction
+ * for example transfer lengths larger than the supported hw max
+ * @xor_check_result: result of zero sum
+ * @crc32_result: result crc calculation
+ */
+struct iop_adma_desc_slot {
+	struct list_head slot_node;
+	struct list_head chain_node;
+	void *hw_desc;
+	struct iop_adma_desc_slot *group_head;
+	u16 slot_cnt;
+	u16 slots_per_op;
+	u16 idx;
+	u16 unmap_src_cnt;
+	size_t unmap_len;
+	struct dma_async_tx_descriptor async_tx;
+	union {
+		u32 *xor_check_result;
+		u32 *crc32_result;
+	};
+};
+
+struct iop_adma_platform_data {
+	int hw_id;
+	dma_cap_mask_t cap_mask;
+	size_t pool_size;
+};
+
+#define to_iop_sw_desc(addr_hw_desc) \
+	container_of(addr_hw_desc, struct iop_adma_desc_slot, hw_desc)
+#define iop_hw_desc_slot_idx(hw_desc, idx) \
+	((void *)(((unsigned long)(hw_desc)) + ((idx) * IOP_ADMA_SLOT_SIZE)))
+#endif
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
new file mode 100644
index 000000000000..ff1255079fa1
--- /dev/null
+++ b/include/linux/async_tx.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef _ASYNC_TX_H_
+#define _ASYNC_TX_H_
+#include
+#include
+#include
+
+/**
+ * dma_chan_ref - object used to manage dma channels received from the
+ * dmaengine core.
+ * @chan - the channel being tracked
+ * @node - node for the channel to be placed on async_tx_master_list
+ * @rcu - for list_del_rcu
+ * @count - number of times this channel is listed in the pool
+ * (for channels with multiple capabilities)
+ */
+struct dma_chan_ref {
+	struct dma_chan *chan;
+	struct list_head node;
+	struct rcu_head rcu;
+	atomic_t count;
+};
+
+/**
+ * async_tx_flags - modifiers for the async_* calls
+ * @ASYNC_TX_XOR_ZERO_DST: this flag must be used for xor operations where the
+ * destination address is not a source. The asynchronous case handles this
+ * implicitly, the synchronous case needs to zero the destination block.
+ * @ASYNC_TX_XOR_DROP_DST: this flag must be used if the destination address is
+ * also one of the source addresses. In the synchronous case the destination
+ * address is an implied source, whereas the asynchronous case it must be listed
+ * as a source. The destination address must be the first address in the source
+ * array.
+ * @ASYNC_TX_ASSUME_COHERENT: skip cache maintenance operations
+ * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
+ * dependency chain
+ * @ASYNC_TX_DEP_ACK: ack the dependency descriptor. Useful for chaining.
+ * @ASYNC_TX_KMAP_SRC: if the transaction is to be performed synchronously
+ * take an atomic mapping (KM_USER0) on the source page(s)
+ * @ASYNC_TX_KMAP_DST: if the transaction is to be performed synchronously
+ * take an atomic mapping (KM_USER0) on the dest page(s)
+ */
+enum async_tx_flags {
+	ASYNC_TX_XOR_ZERO_DST = (1 << 0),
+	ASYNC_TX_XOR_DROP_DST = (1 << 1),
+	ASYNC_TX_ASSUME_COHERENT = (1 << 2),
+	ASYNC_TX_ACK = (1 << 3),
+	ASYNC_TX_DEP_ACK = (1 << 4),
+	ASYNC_TX_KMAP_SRC = (1 << 5),
+	ASYNC_TX_KMAP_DST = (1 << 6),
+};
+
+#ifdef CONFIG_DMA_ENGINE
+void async_tx_issue_pending_all(void);
+enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
+void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx);
+struct dma_chan *
+async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
+	enum dma_transaction_type tx_type);
+#else
+static inline void async_tx_issue_pending_all(void)
+{
+	do { } while (0);
+}
+
+static inline enum dma_status
+dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
+{
+	return DMA_SUCCESS;
+}
+
+/* stub: same signature as the CONFIG_DMA_ENGINE prototype above */
+static inline void
+async_tx_run_dependencies(struct dma_async_tx_descriptor *tx)
+{
+	do { } while (0);
+}
+
+static inline struct dma_chan *
+async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
+	enum dma_transaction_type tx_type)
+{
+	return NULL;
+}
+#endif
+
+/**
+ * async_tx_sync_epilog - actions to take if an operation is run synchronously
+ * @flags: async_tx flags
+ * @depend_tx: transaction depends on depend_tx
+ * @cb_fn: function to call when the transaction completes
+ * @cb_fn_param: parameter to pass to the callback routine
+ */
+static inline void
+async_tx_sync_epilog(unsigned long flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param)
+{
+	if (cb_fn)
+		cb_fn(cb_fn_param);
+
+	if (depend_tx && (flags & ASYNC_TX_DEP_ACK))
+		async_tx_ack(depend_tx);
+}
+
+void
+async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
+	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_xor(struct page *dest, struct page **src_list, unsigned int offset,
+	int src_cnt, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_xor_zero_sum(struct page *dest, struct page **src_list,
+	unsigned int offset, int src_cnt, size_t len,
+	u32 *result, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
+	unsigned int src_offset, size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_memset(struct page *dest, int val, unsigned int offset,
+	size_t len, enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+
+struct dma_async_tx_descriptor *
+async_trigger_callback(enum async_tx_flags flags,
+	struct dma_async_tx_descriptor *depend_tx,
+	dma_async_tx_callback cb_fn, void *cb_fn_param);
+#endif /* _ASYNC_TX_H_ */
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index c94d8f1d62e5..a3b6035b6c86 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -21,28 +21,39 @@
 #ifndef DMAENGINE_H
 #define DMAENGINE_H
 
-#ifdef CONFIG_DMA_ENGINE
-
 #include
 #include
 #include
 #include
 #include
+#include
 
 /**
- * enum dma_event - resource PNP/power managment events
+ * enum dma_state - resource PNP/power management state
  * @DMA_RESOURCE_SUSPEND: DMA device going into low power state
  * @DMA_RESOURCE_RESUME: DMA device returning to full power
- * @DMA_RESOURCE_ADDED: DMA device added to the system
+ * @DMA_RESOURCE_AVAILABLE: DMA device available to the system
  * @DMA_RESOURCE_REMOVED: DMA device removed from the system
  */
-enum dma_event {
+enum dma_state {
 	DMA_RESOURCE_SUSPEND,
 	DMA_RESOURCE_RESUME,
-	DMA_RESOURCE_ADDED,
+	DMA_RESOURCE_AVAILABLE,
 	DMA_RESOURCE_REMOVED,
 };
 
+/**
+ * enum dma_state_client - state of the channel in the client
+ * @DMA_ACK: client would like to use, or was using this channel
+ * @DMA_DUP: client has already seen this channel, or is not using this channel
+ * @DMA_NAK: client does not want to see any more channels
+ */
+enum dma_state_client {
+	DMA_ACK,
+	DMA_DUP,
+	DMA_NAK,
+};
+
 /**
  * typedef dma_cookie_t - an opaque DMA cookie
  *
@@ -64,6 +75,31 @@ enum dma_status {
 	DMA_ERROR,
 };
 
+/**
+ * enum dma_transaction_type - DMA transaction types/indexes
+ */
+enum dma_transaction_type {
+	DMA_MEMCPY,
+	DMA_XOR,
+	DMA_PQ_XOR,
+	DMA_DUAL_XOR,
+	DMA_PQ_UPDATE,
+	DMA_ZERO_SUM,
+	DMA_PQ_ZERO_SUM,
+	DMA_MEMSET,
+	DMA_MEMCPY_CRC32C,
+	DMA_INTERRUPT,
+};
+
+/* last transaction type for creation of the capabilities mask */
+#define DMA_TX_TYPE_END (DMA_INTERRUPT + 1)
+
+/**
+ * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t.
+ * See linux/cpumask.h
+ */
+typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t;
+
 /**
  * struct dma_chan_percpu - the per-CPU part of struct dma_chan
  * @refcount: local_t used for open-coded "bigref" counting
@@ -80,7 +116,6 @@ struct dma_chan_percpu {
 
 /**
  * struct dma_chan - devices supply DMA channels, clients use them
- * @client: ptr to the client user of this chan, will be %NULL when unused
  * @device: ptr to the dma device who supplies this channel, always !%NULL
  * @cookie: last cookie value returned to client
  * @chan_id: channel ID for sysfs
@@ -88,12 +123,10 @@ struct dma_chan_percpu {
  * @refcount: kref, used in "bigref" slow-mode
  * @slow_ref: indicates that the DMA channel is free
  * @rcu: the DMA channel's RCU head
- * @client_node: used to add this to the client chan list
  * @device_node: used to add this to the device chan list
  * @local: per-cpu pointer to a struct dma_chan_percpu
  */
 struct dma_chan {
-	struct dma_client *client;
 	struct dma_device *device;
 	dma_cookie_t cookie;
 
@@ -105,11 +138,11 @@ struct dma_chan {
 	int slow_ref;
 	struct rcu_head rcu;
 
-	struct list_head client_node;
 	struct list_head device_node;
 	struct dma_chan_percpu *local;
 };
+
 
 void dma_chan_cleanup(struct kref *kref);
 
 static inline void dma_chan_get(struct dma_chan *chan)
@@ -134,169 +167,206 @@ static inline void dma_chan_put(struct dma_chan *chan)
 
 /*
  * typedef dma_event_callback - function pointer to a DMA event callback
+ * For each channel added to the system this routine is called for each client.
+ * If the client would like to use the channel it returns DMA_ACK to signal
+ * the dmaengine core to take out a reference on the channel and its
+ * corresponding device. A client must not 'ack' an available channel more
+ * than once. When a channel is removed all clients are notified. If a client
+ * is using the channel it must 'ack' the removal. A client must not 'ack' a
+ * removed channel more than once.
+ * @client - 'this' pointer for the client context
+ * @chan - channel to be acted upon
+ * @state - available or removed
  */
-typedef void (*dma_event_callback) (struct dma_client *client,
-		struct dma_chan *chan, enum dma_event event);
+struct dma_client;
+typedef enum dma_state_client (*dma_event_callback) (struct dma_client *client,
+		struct dma_chan *chan, enum dma_state state);
 
 /**
  * struct dma_client - info on the entity making use of DMA services
  * @event_callback: func ptr to call when something happens
- * @chan_count: number of chans allocated
- * @chans_desired: number of chans requested. Can be +/- chan_count
- * @lock: protects access to the channels list
- * @channels: the list of DMA channels allocated
+ * @cap_mask: only return channels that satisfy the requested capabilities
+ * a value of zero corresponds to any capability
  * @global_node: list_head for global dma_client_list
  */
 struct dma_client {
 	dma_event_callback event_callback;
-	unsigned int chan_count;
-	unsigned int chans_desired;
-
-	spinlock_t lock;
-	struct list_head channels;
+	dma_cap_mask_t cap_mask;
 	struct list_head global_node;
 };
 
+typedef void (*dma_async_tx_callback)(void *dma_async_param);
+/**
+ * struct dma_async_tx_descriptor - async transaction descriptor
+ * ---dma generic offload fields---
+ * @cookie: tracking cookie for this transaction, set to -EBUSY if
+ * this tx is sitting on a dependency list
+ * @ack: the descriptor can not be reused until the client acknowledges
+ * receipt, i.e. has had a chance to establish any dependency chains
+ * @phys: physical address of the descriptor
+ * @tx_list: driver common field for operations that require multiple
+ * descriptors
+ * @chan: target channel for this operation
+ * @tx_submit: set the prepared descriptor(s) to be executed by the engine
+ * @tx_set_dest: set a destination address in a hardware descriptor
+ * @tx_set_src: set a source address in a hardware descriptor
+ * @callback: routine to call after this operation is complete
+ * @callback_param: general parameter to pass to the callback routine
+ * ---async_tx api specific fields---
+ * @depend_list: at completion this list of transactions are submitted
+ * @depend_node: allow this transaction to be executed after another
+ * transaction has completed, possibly on another channel
+ * @parent: pointer to the next level up in the dependency chain
+ * @lock: protect the dependency list
+ */
+struct dma_async_tx_descriptor {
+	dma_cookie_t cookie;
+	int ack;
+	dma_addr_t phys;
+	struct list_head tx_list;
+	struct dma_chan *chan;
+	dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx);
+	void (*tx_set_dest)(dma_addr_t addr,
+		struct dma_async_tx_descriptor *tx, int index);
+	void (*tx_set_src)(dma_addr_t addr,
+		struct dma_async_tx_descriptor *tx, int index);
+	dma_async_tx_callback callback;
+	void *callback_param;
+	struct list_head depend_list;
+	struct list_head depend_node;
+	struct dma_async_tx_descriptor *parent;
+	spinlock_t lock;
+};
+
 /**
  * struct dma_device - info on the entity supplying DMA services
  * @chancnt: how many DMA channels are supported
  * @channels: the list of struct dma_chan
  * @global_node: list_head for global dma_device_list
+ * @cap_mask: one or more dma_capability flags
+ * @max_xor: maximum number of xor sources, 0 if no capability
  * @refcount: reference count
  * @done: IO completion struct
  * @dev_id: unique device ID
+ * @dev: struct device reference for dma mapping api
  * @device_alloc_chan_resources: allocate resources and return the
  * number of allocated descriptors
  * @device_free_chan_resources: release DMA channel's resources
- * @device_memcpy_buf_to_buf: memcpy buf pointer to buf pointer
- * @device_memcpy_buf_to_pg: memcpy buf pointer to struct page
- * @device_memcpy_pg_to_pg: memcpy struct page/offset to struct page/offset
- * @device_memcpy_complete: poll the status of an IOAT DMA transaction
- * @device_memcpy_issue_pending: push appended descriptors to hardware
+ * @device_prep_dma_memcpy: prepares a memcpy operation
+ * @device_prep_dma_xor: prepares a xor operation
+ * @device_prep_dma_zero_sum: prepares a zero_sum operation
+ * @device_prep_dma_memset: prepares a memset operation
+ * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
+ * @device_dependency_added: async_tx notifies the channel about new deps
+ * @device_issue_pending: push pending transactions to hardware
  */
 struct dma_device {
 	unsigned int chancnt;
 	struct list_head channels;
 	struct list_head global_node;
+	dma_cap_mask_t cap_mask;
+	int max_xor;
 	struct kref refcount;
 	struct completion done;
 	int dev_id;
+	struct device *dev;
 	int (*device_alloc_chan_resources)(struct dma_chan *chan);
 	void (*device_free_chan_resources)(struct dma_chan *chan);
-	dma_cookie_t (*device_memcpy_buf_to_buf)(struct dma_chan *chan,
-			void *dest, void *src, size_t len);
-	dma_cookie_t (*device_memcpy_buf_to_pg)(struct dma_chan *chan,
-			struct page *page, unsigned int offset, void *kdata,
-			size_t len);
-	dma_cookie_t (*device_memcpy_pg_to_pg)(struct dma_chan *chan,
-			struct page *dest_pg, unsigned int dest_off,
-			struct page *src_pg, unsigned int src_off, size_t len);
-	enum dma_status (*device_memcpy_complete)(struct dma_chan *chan,
+
+	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(
+		struct dma_chan *chan, size_t len, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_xor)(
+		struct dma_chan *chan, unsigned int src_cnt, size_t len,
+		int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_zero_sum)(
+		struct dma_chan *chan, unsigned int src_cnt, size_t len,
+		u32 *result, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
+		struct dma_chan *chan, int value, size_t len, int int_en);
+	struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)(
+		struct dma_chan *chan);
+
+	void (*device_dependency_added)(struct dma_chan *chan);
+	enum dma_status (*device_is_tx_complete)(struct dma_chan *chan,
 			dma_cookie_t cookie, dma_cookie_t *last,
 			dma_cookie_t *used);
-	void (*device_memcpy_issue_pending)(struct dma_chan *chan);
+	void (*device_issue_pending)(struct dma_chan *chan);
 };
 
 /* --- public DMA engine API --- */
 
-struct dma_client *dma_async_client_register(dma_event_callback event_callback);
+void dma_async_client_register(struct dma_client *client);
 void dma_async_client_unregister(struct dma_client *client);
-void dma_async_client_chan_request(struct dma_client *client,
-		unsigned int number);
-
-/**
- * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses
- * @chan: DMA channel to offload copy to
- * @dest: destination address (virtual)
- * @src: source address (virtual)
- * @len: length
- *
- * Both @dest and @src must be mappable to a bus address according to the
- * DMA mapping API rules for streaming mappings.
- * Both @dest and @src must stay memory resident (kernel memory or locked
- * user space pages).
- */ -static inline dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, - void *dest, void *src, size_t len) -{ - int cpu = get_cpu(); - per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; - per_cpu_ptr(chan->local, cpu)->memcpy_count++; - put_cpu(); - - return chan->device->device_memcpy_buf_to_buf(chan, dest, src, len); -} - -/** - * dma_async_memcpy_buf_to_pg - offloaded copy from address to page - * @chan: DMA channel to offload copy to - * @page: destination page - * @offset: offset in page to copy to - * @kdata: source address (virtual) - * @len: length - * - * Both @page/@offset and @kdata must be mappable to a bus address according - * to the DMA mapping API rules for streaming mappings. - * Both @page/@offset and @kdata must stay memory resident (kernel memory or - * locked user space pages) - */ -static inline dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, - struct page *page, unsigned int offset, void *kdata, size_t len) -{ - int cpu = get_cpu(); - per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; - per_cpu_ptr(chan->local, cpu)->memcpy_count++; - put_cpu(); - - return chan->device->device_memcpy_buf_to_pg(chan, page, offset, - kdata, len); -} - -/** - * dma_async_memcpy_pg_to_pg - offloaded copy from page to page - * @chan: DMA channel to offload copy to - * @dest_pg: destination page - * @dest_off: offset in page to copy to - * @src_pg: source page - * @src_off: offset in page to copy from - * @len: length - * - * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus - * address according to the DMA mapping API rules for streaming mappings. - * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident - * (kernel memory or locked user space pages). 
- */ -static inline dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, +void dma_async_client_chan_request(struct dma_client *client); +dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, + void *dest, void *src, size_t len); +dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, + struct page *page, unsigned int offset, void *kdata, size_t len); +dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, unsigned int dest_off, struct page *src_pg, - unsigned int src_off, size_t len) -{ - int cpu = get_cpu(); - per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; - per_cpu_ptr(chan->local, cpu)->memcpy_count++; - put_cpu(); + unsigned int src_off, size_t len); +void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, + struct dma_chan *chan); - return chan->device->device_memcpy_pg_to_pg(chan, dest_pg, dest_off, - src_pg, src_off, len); +static inline void +async_tx_ack(struct dma_async_tx_descriptor *tx) +{ + tx->ack = 1; } +#define first_dma_cap(mask) __first_dma_cap(&(mask)) +static inline int __first_dma_cap(const dma_cap_mask_t *srcp) +{ + return min_t(int, DMA_TX_TYPE_END, + find_first_bit(srcp->bits, DMA_TX_TYPE_END)); +} + +#define next_dma_cap(n, mask) __next_dma_cap((n), &(mask)) +static inline int __next_dma_cap(int n, const dma_cap_mask_t *srcp) +{ + return min_t(int, DMA_TX_TYPE_END, + find_next_bit(srcp->bits, DMA_TX_TYPE_END, n+1)); +} + +#define dma_cap_set(tx, mask) __dma_cap_set((tx), &(mask)) +static inline void +__dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) +{ + set_bit(tx_type, dstp->bits); +} + +#define dma_has_cap(tx, mask) __dma_has_cap((tx), &(mask)) +static inline int +__dma_has_cap(enum dma_transaction_type tx_type, dma_cap_mask_t *srcp) +{ + return test_bit(tx_type, srcp->bits); +} + +#define for_each_dma_cap_mask(cap, mask) \ + for ((cap) = first_dma_cap(mask); \ + (cap) < DMA_TX_TYPE_END; \ + (cap) = next_dma_cap((cap), (mask))) + /** - * 
dma_async_memcpy_issue_pending - flush pending copies to HW + * dma_async_issue_pending - flush pending transactions to HW * @chan: target DMA channel * * This allows drivers to push copies to HW in batches, * reducing MMIO writes where possible. */ -static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan) +static inline void dma_async_issue_pending(struct dma_chan *chan) { - return chan->device->device_memcpy_issue_pending(chan); + return chan->device->device_issue_pending(chan); } +#define dma_async_memcpy_issue_pending(chan) dma_async_issue_pending(chan) + /** - * dma_async_memcpy_complete - poll for transaction completion + * dma_async_is_tx_complete - poll for transaction completion * @chan: DMA channel * @cookie: transaction identifier to check status of * @last: returns last completed cookie, can be NULL @@ -306,12 +376,15 @@ static inline void dma_async_memcpy_issue_pending(struct dma_chan *chan) * internal state and can be used with dma_async_is_complete() to check * the status of multiple cookies without re-checking hardware state. 
*/ -static inline enum dma_status dma_async_memcpy_complete(struct dma_chan *chan, +static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) { - return chan->device->device_memcpy_complete(chan, cookie, last, used); + return chan->device->device_is_tx_complete(chan, cookie, last, used); } +#define dma_async_memcpy_complete(chan, cookie, last, used)\ + dma_async_is_tx_complete(chan, cookie, last, used) + /** * dma_async_is_complete - test a cookie against chan state * @cookie: transaction identifier to test status of @@ -334,6 +407,7 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, return DMA_IN_PROGRESS; } +enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); /* --- DMA device --- */ @@ -362,5 +436,4 @@ dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, struct dma_pinned_list *pinned_list, struct page *page, unsigned int offset, size_t len); -#endif /* CONFIG_DMA_ENGINE */ #endif /* DMAENGINE_H */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 9366182fffa7..2c7add169539 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -479,6 +479,9 @@ #define PCI_DEVICE_ID_IBM_ICOM_V2_ONE_PORT_RVX_ONE_PORT_MDM_PCIE 0x0361 #define PCI_DEVICE_ID_IBM_ICOM_FOUR_PORT_MODEL 0x252 +#define PCI_VENDOR_ID_UNISYS 0x1018 +#define PCI_DEVICE_ID_UNISYS_DMA_DIRECTOR 0x001C + #define PCI_VENDOR_ID_COMPEX2 0x101a /* pci.ids says "AT&T GIS (NCR)" */ #define PCI_DEVICE_ID_COMPEX2_100VG 0x0005 diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index d8286db60b96..93678f57ccbe 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -116,13 +116,46 @@ * attach a request to an active stripe (add_stripe_bh()) * lockdev attach-buffer unlockdev * handle a stripe (handle_stripe()) - * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. 
record io needed unlockstripe schedule io + * lockstripe clrSTRIPE_HANDLE ... + * (lockdev check-buffers unlockdev) .. + * change-state .. + * record io/ops needed unlockstripe schedule io/ops * release an active stripe (release_stripe()) * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev * * The refcount counts each thread that have activated the stripe, * plus raid5d if it is handling it, plus one for each active request - * on a cached buffer. + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * Stripe operations are performed outside the stripe lock, + * the stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero. + * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. 
This causes new read requests + * to be held off, as well as parity checks and compute block operations. + * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guarantees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. */ struct stripe_head { @@ -136,15 +169,46 @@ struct stripe_head { spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ int disks; /* disks in stripe */ + /* stripe_operations + * @pending - pending ops flags (set for request->issue->complete) + * @ack - submitted ops flags (set for issue->complete) + * @complete - completed ops flags (set for complete) + * @target - STRIPE_OP_COMPUTE_BLK target + * @count - raid5_run_ops is set to run when this is non-zero + */ + struct stripe_operations { + unsigned long pending; + unsigned long ack; + unsigned long complete; + int target; + int count; + u32 zero_sum_result; + } ops; struct r5dev { struct bio req; struct bio_vec vec; struct page *page; - struct bio *toread, *towrite, *written; + struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ unsigned long flags; } dev[1]; /* allocated with extra space depending of RAID geometry */ }; + +/* stripe_head_state - collects and tracks the dynamic state of a stripe_head + * for handle_stripe. 
It is only valid under spin_lock(sh->lock); + */ +struct stripe_head_state { + int syncing, expanding, expanded; + int locked, uptodate, to_read, to_write, failed, written; + int to_fill, compute, req_compute, non_overwrite; + int failed_num; +}; + +/* r6_state - extra state data only relevant to r6 */ +struct r6_state { + int p_failed, q_failed, qd_idx, failed_num[2]; +}; + /* Flags */ #define R5_UPTODATE 0 /* page contains current data */ #define R5_LOCKED 1 /* IO has been submitted on "req" */ @@ -158,6 +222,15 @@ struct stripe_head { #define R5_ReWrite 9 /* have tried to over-write the readerror */ #define R5_Expanded 10 /* This block now has post-expand data */ +#define R5_Wantcompute 11 /* compute_block in progress treat as + * uptodate + */ +#define R5_Wantfill 12 /* dev->toread contains a bio that needs + * filling + */ +#define R5_Wantprexor 13 /* distinguish blocks ready for rmw from + * other "towrites" + */ /* * Write method */ @@ -179,6 +252,24 @@ struct stripe_head { #define STRIPE_EXPANDING 9 #define STRIPE_EXPAND_SOURCE 10 #define STRIPE_EXPAND_READY 11 +/* + * Operations flags (in issue order) + */ +#define STRIPE_OP_BIOFILL 0 +#define STRIPE_OP_COMPUTE_BLK 1 +#define STRIPE_OP_PREXOR 2 +#define STRIPE_OP_BIODRAIN 3 +#define STRIPE_OP_POSTXOR 4 +#define STRIPE_OP_CHECK 5 +#define STRIPE_OP_IO 6 + +/* modifiers to the base operations + * STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back + * STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check + */ +#define STRIPE_OP_MOD_REPAIR_PD 7 +#define STRIPE_OP_MOD_DMA_CHECK 8 + /* * Plugging: * diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h index f0d67cbdea40..3e120587eada 100644 --- a/include/linux/raid/xor.h +++ b/include/linux/raid/xor.h @@ -3,9 +3,10 @@ #include -#define MAX_XOR_BLOCKS 5 +#define MAX_XOR_BLOCKS 4 -extern void xor_block(unsigned int count, unsigned int bytes, void **ptr); +extern void xor_blocks(unsigned int count, unsigned int bytes, + void 
*dest, void **srcs); struct xor_block_template { struct xor_block_template *next; diff --git a/net/core/dev.c b/net/core/dev.c index 4221dcda88d7..96443055324e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -151,9 +151,22 @@ static struct list_head ptype_base[16] __read_mostly; /* 16 way hashed list */ static struct list_head ptype_all __read_mostly; /* Taps */ #ifdef CONFIG_NET_DMA -static struct dma_client *net_dma_client; -static unsigned int net_dma_count; -static spinlock_t net_dma_event_lock; +struct net_dma { + struct dma_client client; + spinlock_t lock; + cpumask_t channel_mask; + struct dma_chan *channels[NR_CPUS]; +}; + +static enum dma_state_client +netdev_dma_event(struct dma_client *client, struct dma_chan *chan, + enum dma_state state); + +static struct net_dma net_dma = { + .client = { + .event_callback = netdev_dma_event, + }, +}; #endif /* @@ -2022,12 +2035,13 @@ out: * There may not be any more sk_buffs coming right now, so push * any pending DMA copies to hardware */ - if (net_dma_client) { - struct dma_chan *chan; - rcu_read_lock(); - list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node) - dma_async_memcpy_issue_pending(chan); - rcu_read_unlock(); + if (!cpus_empty(net_dma.channel_mask)) { + int chan_idx; + for_each_cpu_mask(chan_idx, net_dma.channel_mask) { + struct dma_chan *chan = net_dma.channels[chan_idx]; + if (chan) + dma_async_memcpy_issue_pending(chan); + } } #endif return; @@ -3775,12 +3789,13 @@ static int dev_cpu_callback(struct notifier_block *nfb, * This is called when the number of channels allocated to the net_dma_client * changes. The net_dma_client tries to have one DMA channel per CPU. 
*/ -static void net_dma_rebalance(void) + +static void net_dma_rebalance(struct net_dma *net_dma) { - unsigned int cpu, i, n; + unsigned int cpu, i, n, chan_idx; struct dma_chan *chan; - if (net_dma_count == 0) { + if (cpus_empty(net_dma->channel_mask)) { for_each_online_cpu(cpu) rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL); return; @@ -3789,10 +3804,12 @@ static void net_dma_rebalance(void) i = 0; cpu = first_cpu(cpu_online_map); - rcu_read_lock(); - list_for_each_entry(chan, &net_dma_client->channels, client_node) { - n = ((num_online_cpus() / net_dma_count) - + (i < (num_online_cpus() % net_dma_count) ? 1 : 0)); + for_each_cpu_mask(chan_idx, net_dma->channel_mask) { + chan = net_dma->channels[chan_idx]; + + n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask)) + + (i < (num_online_cpus() % + cpus_weight(net_dma->channel_mask)) ? 1 : 0)); while(n) { per_cpu(softnet_data, cpu).net_dma = chan; @@ -3801,7 +3818,6 @@ static void net_dma_rebalance(void) } i++; } - rcu_read_unlock(); } /** @@ -3810,23 +3826,53 @@ static void net_dma_rebalance(void) * @chan: DMA channel for the event * @event: event type */ -static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_event event) +static enum dma_state_client +netdev_dma_event(struct dma_client *client, struct dma_chan *chan, + enum dma_state state) { - spin_lock(&net_dma_event_lock); - switch (event) { - case DMA_RESOURCE_ADDED: - net_dma_count++; - net_dma_rebalance(); + int i, found = 0, pos = -1; + struct net_dma *net_dma = + container_of(client, struct net_dma, client); + enum dma_state_client ack = DMA_DUP; /* default: take no action */ + + spin_lock(&net_dma->lock); + switch (state) { + case DMA_RESOURCE_AVAILABLE: + for (i = 0; i < NR_CPUS; i++) + if (net_dma->channels[i] == chan) { + found = 1; + break; + } else if (net_dma->channels[i] == NULL && pos < 0) + pos = i; + + if (!found && pos >= 0) { + ack = DMA_ACK; + net_dma->channels[pos] = chan; + 
cpu_set(pos, net_dma->channel_mask); + net_dma_rebalance(net_dma); + } break; case DMA_RESOURCE_REMOVED: - net_dma_count--; - net_dma_rebalance(); + for (i = 0; i < NR_CPUS; i++) + if (net_dma->channels[i] == chan) { + found = 1; + pos = i; + break; + } + + if (found) { + ack = DMA_ACK; + cpu_clear(pos, net_dma->channel_mask); + net_dma->channels[i] = NULL; + net_dma_rebalance(net_dma); + } break; default: break; } - spin_unlock(&net_dma_event_lock); + spin_unlock(&net_dma->lock); + + return ack; } /** @@ -3834,12 +3880,10 @@ static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, */ static int __init netdev_dma_register(void) { - spin_lock_init(&net_dma_event_lock); - net_dma_client = dma_async_client_register(netdev_dma_event); - if (net_dma_client == NULL) - return -ENOMEM; - - dma_async_client_chan_request(net_dma_client, num_online_cpus()); + spin_lock_init(&net_dma.lock); + dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask); + dma_async_client_register(&net_dma.client); + dma_async_client_chan_request(&net_dma.client); return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 450f44bb2c8e..987b94403be5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1116,6 +1116,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, long timeo; struct task_struct *user_recv = NULL; int copied_early = 0; + struct sk_buff *skb; lock_sock(sk); @@ -1142,16 +1143,26 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, #ifdef CONFIG_NET_DMA tp->ucopy.dma_chan = NULL; preempt_disable(); - if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) { - preempt_enable_no_resched(); - tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); - } else - preempt_enable_no_resched(); + skb = skb_peek_tail(&sk->sk_receive_queue); + { + int available = 0; + + if (skb) + available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); + if ((available < 
target) && + (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && + __get_cpu_var(softnet_data).net_dma) { + preempt_enable_no_resched(); + tp->ucopy.pinned_list = + dma_pin_iovec_pages(msg->msg_iov, len); + } else { + preempt_enable_no_resched(); + } + } #endif do { - struct sk_buff *skb; u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ @@ -1439,7 +1450,6 @@ skip_copy: #ifdef CONFIG_NET_DMA if (tp->ucopy.dma_chan) { - struct sk_buff *skb; dma_cookie_t done, used; dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);