Merge branch 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.33' of git://git.kernel.dk/linux-2.6-block: (113 commits)
  cfq-iosched: Do not access cfqq after freeing it
  block: include linux/err.h to use ERR_PTR
  cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit
  blkio: Allow CFQ group IO scheduling even when CFQ is a module
  blkio: Implement dynamic io controlling policy registration
  blkio: Export some symbols from blkio as its user CFQ can be a module
  block: Fix io_context leak after failure of clone with CLONE_IO
  block: Fix io_context leak after clone with CLONE_IO
  cfq-iosched: make nonrot check logic consistent
  io controller: quick fix for blk-cgroup and modular CFQ
  cfq-iosched: move IO controller declerations to a header file
  cfq-iosched: fix compile problem with !CONFIG_CGROUP
  blkio: Documentation
  blkio: Wait on sync-noidle queue even if rq_noidle = 1
  blkio: Implement group_isolation tunable
  blkio: Determine async workload length based on total number of queues
  blkio: Wait for cfq queue to get backlogged if group is empty
  blkio: Propagate cgroup weight updation to cfq groups
  blkio: Drop the reference to queue once the task changes cgroup
  blkio: Provide some isolation between groups
  ...
This commit is contained in:
Linus Torvalds 2009-12-08 08:19:16 -08:00
commit 6035ccd8e9
107 changed files with 24813 additions and 2132 deletions

View File

@ -0,0 +1,588 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
version="1.0"
width="210mm"
height="297mm"
viewBox="0 0 21000 29700"
id="svg2"
style="fill-rule:evenodd">
<defs
id="defs4" />
<g
id="Default"
style="visibility:visible">
<desc
id="desc180">Master slide</desc>
</g>
<path
d="M 11999,8601 L 11899,8301 L 12099,8301 L 11999,8601 z"
id="path193"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,7801 L 11999,8361"
id="path197"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 7999,10401 L 7899,10101 L 8099,10101 L 7999,10401 z"
id="path209"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,9601 L 7999,10161"
id="path213"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 11999,7801 L 11685,7840 L 11724,7644 L 11999,7801 z"
id="path225"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,7001 L 11764,7754"
id="path229"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-1244.4792,1416.5139)"
id="g245"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text247">
<tspan
x="9139 9368 9579 9808 9986 10075 10252 10481 10659 10837 10909"
y="9284"
id="tspan249">RSDataReply</tspan>
</text>
</g>
<path
d="M 7999,9601 L 8281,9458 L 8311,9655 L 7999,9601 z"
id="path259"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,9001 L 8236,9565"
id="path263"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,1620.9382,-1639.4947)"
id="g279"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text281">
<tspan
x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
y="7023"
id="tspan283">CsumRSRequest</tspan>
</text>
</g>
<text
id="text297"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
y="5707"
id="tspan299">w_make_resync_request()</tspan>
</text>
<text
id="text313"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
y="7806"
id="tspan315">receive_DataRequest()</tspan>
</text>
<text
id="text329"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
y="8606"
id="tspan331">drbd_endio_read_sec()</tspan>
</text>
<text
id="text345"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
y="9007"
id="tspan347">w_e_end_csum_rs_req()</tspan>
</text>
<text
id="text361"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
y="9507"
id="tspan363">receive_RSDataReply()</tspan>
</text>
<text
id="text377"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
y="10407"
id="tspan379">drbd_endio_write_sec()</tspan>
</text>
<text
id="text393"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
y="10907"
id="tspan395">e_end_resync_block()</tspan>
</text>
<path
d="M 11999,11601 L 11685,11640 L 11724,11444 L 11999,11601 z"
id="path405"
style="fill:#000080;visibility:visible" />
<path
d="M 7999,10801 L 11764,11554"
id="path409"
style="fill:none;stroke:#000080;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,2434.7562,-1674.649)"
id="g425"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<text
id="text427">
<tspan
x="9320 9621 9726 9798 9887 10065 10277 10438"
y="10943"
id="tspan429">WriteAck</tspan>
</text>
</g>
<text
id="text443"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
y="11559"
id="tspan445">got_BlockAck()</tspan>
</text>
<text
id="text459"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14302 14540 14658 14777 14870 15107 15225 15437 15649 15886"
y="4877"
id="tspan461">Checksum based Resync, case not in sync</tspan>
</text>
<text
id="text475"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="6961 7266 7571 7854 8159 8299 8536 8654 8891 9010 9247 9484 9603 9840 9958 10077 10170 10407"
y="2806"
id="tspan477">DRBD-8.3 data flow</tspan>
</text>
<text
id="text491"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
y="7005"
id="tspan493">w_e_send_csum()</tspan>
</text>
<path
d="M 11999,17601 L 11899,17301 L 12099,17301 L 11999,17601 z"
id="path503"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,16801 L 11999,17361"
id="path507"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 11999,16801 L 11685,16840 L 11724,16644 L 11999,16801 z"
id="path519"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,16001 L 11764,16754"
id="path523"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-2539.5806,1529.3491)"
id="g539"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<text
id="text541">
<tspan
x="9269 9498 9709 9798 9959 10048 10226 10437 10598 10776"
y="18265"
id="tspan543">RSIsInSync</tspan>
</text>
</g>
<path
d="M 7999,18601 L 8281,18458 L 8311,18655 L 7999,18601 z"
id="path553"
style="fill:#000080;visibility:visible" />
<path
d="M 11999,18001 L 8236,18565"
id="path557"
style="fill:none;stroke:#000080;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,3461.4027,-1449.3012)"
id="g573"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text575">
<tspan
x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
y="16023"
id="tspan577">CsumRSRequest</tspan>
</text>
</g>
<text
id="text591"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
y="16806"
id="tspan593">receive_DataRequest()</tspan>
</text>
<text
id="text607"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
y="17606"
id="tspan609">drbd_endio_read_sec()</tspan>
</text>
<text
id="text623"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
y="18007"
id="tspan625">w_e_end_csum_rs_req()</tspan>
</text>
<text
id="text639"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5735 5913 6091 6180 6357 6446 6607 6696 6874 7085 7246 7424 7585 7691"
y="18507"
id="tspan641">got_IsInSync()</tspan>
</text>
<text
id="text655"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14159 14396 14514 14726 14937 15175"
y="13877"
id="tspan657">Checksum based Resync, case in sync</tspan>
</text>
<path
d="M 12000,24601 L 11900,24301 L 12100,24301 L 12000,24601 z"
id="path667"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,23801 L 12000,24361"
id="path671"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 8000,26401 L 7900,26101 L 8100,26101 L 8000,26401 z"
id="path683"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,25601 L 8000,26161"
id="path687"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 12000,23801 L 11686,23840 L 11725,23644 L 12000,23801 z"
id="path699"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,23001 L 11765,23754"
id="path703"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-3543.8452,1630.5143)"
id="g719"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text721">
<tspan
x="9464 9710 9921 10150 10328 10505 10577"
y="25236"
id="tspan723">OVReply</tspan>
</text>
</g>
<path
d="M 8000,25601 L 8282,25458 L 8312,25655 L 8000,25601 z"
id="path733"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,25001 L 8237,25565"
id="path737"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,4918.2801,-1381.2128)"
id="g753"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text755">
<tspan
x="9142 9388 9599 9828 10006 10183 10361 10539 10700"
y="23106"
id="tspan757">OVRequest</tspan>
</text>
</g>
<text
id="text771"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13656 13868 14097 14274 14452 14630 14808 14969 15058 15163"
y="23806"
id="tspan773">receive_OVRequest()</tspan>
</text>
<text
id="text787"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
y="24606"
id="tspan789">drbd_endio_read_sec()</tspan>
</text>
<text
id="text803"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14004 14182 14288 14465 14643 14749"
y="25007"
id="tspan805">w_e_end_ov_req()</tspan>
</text>
<text
id="text819"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5101 5207 5385 5546 5723 5795 5956 6134 6312 6557 6769 6998 7175 7353 7425 7586 7692"
y="25507"
id="tspan821">receive_OVReply()</tspan>
</text>
<text
id="text835"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
y="26407"
id="tspan837">drbd_endio_read_sec()</tspan>
</text>
<text
id="text851"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4902 5131 5308 5486 5664 5842 6020 6197 6375 6553 6714 6892 6998 7175 7353 7425 7586 7692"
y="26907"
id="tspan853">w_e_end_ov_reply()</tspan>
</text>
<path
d="M 12000,27601 L 11686,27640 L 11725,27444 L 12000,27601 z"
id="path863"
style="fill:#000080;visibility:visible" />
<path
d="M 8000,26801 L 11765,27554"
id="path867"
style="fill:none;stroke:#000080;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,5704.1907,-1328.312)"
id="g883"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<text
id="text885">
<tspan
x="9279 9525 9736 9965 10143 10303 10481 10553"
y="26935"
id="tspan887">OVResult</tspan>
</text>
</g>
<text
id="text901"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12378 12556 12645 12822 13068 13280 13508 13686 13847 14025 14097 14185 14291"
y="27559"
id="tspan903">got_OVResult()</tspan>
</text>
<text
id="text917"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="8000 8330 8567 8660 8754 8991 9228 9346 9558 9795 9935 10028 10146"
y="21877"
id="tspan919">Online verify</tspan>
</text>
<text
id="text933"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4641 4870 5047 5310 5488 5649 5826 6004 6182 6343 6521 6626 6804 6982 7160 7338 7499 7587 7693"
y="23005"
id="tspan935">w_make_ov_request()</tspan>
</text>
<path
d="M 8000,6500 L 7900,6200 L 8100,6200 L 8000,6500 z"
id="path945"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,5700 L 8000,6260"
id="path949"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 3900,5500 L 3700,5500 L 3700,11000 L 3900,11000"
id="path961"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 3900,14500 L 3700,14500 L 3700,18600 L 3900,18600"
id="path973"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 3900,22800 L 3700,22800 L 3700,26900 L 3900,26900"
id="path985"
style="fill:none;stroke:#000000;visibility:visible" />
<text
id="text1001"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
y="6506"
id="tspan1003">drbd_endio_read_sec()</tspan>
</text>
<text
id="text1017"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
y="14708"
id="tspan1019">w_make_resync_request()</tspan>
</text>
<text
id="text1033"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
y="16006"
id="tspan1035">w_e_send_csum()</tspan>
</text>
<path
d="M 8000,15501 L 7900,15201 L 8100,15201 L 8000,15501 z"
id="path1045"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,14701 L 8000,15261"
id="path1049"
style="fill:none;stroke:#008000;visibility:visible" />
<text
id="text1065"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
y="15507"
id="tspan1067">drbd_endio_read_sec()</tspan>
</text>
<path
d="M 16100,9000 L 16300,9000 L 16300,7500 L 16100,7500"
id="path1077"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 16100,18000 L 16300,18000 L 16300,16500 L 16100,16500"
id="path1089"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 16100,25000 L 16300,25000 L 16300,23500 L 16100,23500"
id="path1101"
style="fill:none;stroke:#000000;visibility:visible" />
<text
id="text1117"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
y="5402"
id="tspan1119">rs_begin_io()</tspan>
</text>
<text
id="text1133"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2027 2133 2294 2472 2649 2827 3005 3077 3255 3432 3504 3682 3788"
y="14402"
id="tspan1135">rs_begin_io()</tspan>
</text>
<text
id="text1149"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
y="22602"
id="tspan1151">rs_begin_io()</tspan>
</text>
<text
id="text1165"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1426 1532 1693 1871 2031 2209 2472 2649 2721 2899 2988 3166 3344 3416 3593 3699"
y="11302"
id="tspan1167">rs_complete_io()</tspan>
</text>
<text
id="text1181"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
y="18931"
id="tspan1183">rs_complete_io()</tspan>
</text>
<text
id="text1197"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
y="27231"
id="tspan1199">rs_complete_io()</tspan>
</text>
<text
id="text1213"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16126 16232 16393 16571 16748 16926 17104 17176 17354 17531 17603 17781 17887"
y="7402"
id="tspan1215">rs_begin_io()</tspan>
</text>
<text
id="text1229"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
y="16331"
id="tspan1231">rs_begin_io()</tspan>
</text>
<text
id="text1245"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
y="23302"
id="tspan1247">rs_begin_io()</tspan>
</text>
<text
id="text1261"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
y="9302"
id="tspan1263">rs_complete_io()</tspan>
</text>
<text
id="text1277"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
y="18331"
id="tspan1279">rs_complete_io()</tspan>
</text>
<text
id="text1293"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16126 16232 16393 16571 16731 16909 17172 17349 17421 17599 17688 17866 18044 18116 18293 18399"
y="25302"
id="tspan1295">rs_complete_io()</tspan>
</text>
</svg>

After

Width:  |  Height:  |  Size: 22 KiB

View File

@ -0,0 +1,459 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
version="1.0"
width="210mm"
height="297mm"
viewBox="0 0 21000 29700"
id="svg2"
style="fill-rule:evenodd">
<defs
id="defs4" />
<g
id="Default"
style="visibility:visible">
<desc
id="desc176">Master slide</desc>
</g>
<path
d="M 11999,19601 L 11899,19301 L 12099,19301 L 11999,19601 z"
id="path189"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,18801 L 11999,19361"
id="path193"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 7999,21401 L 7899,21101 L 8099,21101 L 7999,21401 z"
id="path205"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,20601 L 7999,21161"
id="path209"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 11999,18801 L 11685,18840 L 11724,18644 L 11999,18801 z"
id="path221"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,18001 L 11764,18754"
id="path225"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="-3023.845"
y="1106.8124"
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
id="text243"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="6115.1553 6344.1553 6555.1553 6784.1553 6962.1553 7051.1553 7228.1553 7457.1553 7635.1553 7813.1553 7885.1553"
y="21390.812"
id="tspan245">RSDataReply</tspan>
</text>
<path
d="M 7999,20601 L 8281,20458 L 8311,20655 L 7999,20601 z"
id="path255"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,20001 L 8236,20565"
id="path259"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="3502.5356"
y="-2184.6621"
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
id="text277"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12321.536 12550.536 12761.536 12990.536 13168.536 13257.536 13434.536 13663.536 13841.536 14019.536 14196.536 14374.536 14535.536"
y="15854.338"
id="tspan279">RSDataRequest</tspan>
</text>
<text
id="text293"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
y="17807"
id="tspan295">w_make_resync_request()</tspan>
</text>
<text
id="text309"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
y="18806"
id="tspan311">receive_DataRequest()</tspan>
</text>
<text
id="text325"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
y="19606"
id="tspan327">drbd_endio_read_sec()</tspan>
</text>
<text
id="text341"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13770 13931 14109 14287 14375 14553 14731 14837 15015 15192 15298"
y="20007"
id="tspan343">w_e_end_rsdata_req()</tspan>
</text>
<text
id="text357"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
y="20507"
id="tspan359">receive_RSDataReply()</tspan>
</text>
<text
id="text373"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
y="21407"
id="tspan375">drbd_endio_write_sec()</tspan>
</text>
<text
id="text389"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
y="21907"
id="tspan391">e_end_resync_block()</tspan>
</text>
<path
d="M 11999,22601 L 11685,22640 L 11724,22444 L 11999,22601 z"
id="path401"
style="fill:#000080;visibility:visible" />
<path
d="M 7999,21801 L 11764,22554"
id="path405"
style="fill:none;stroke:#000080;visibility:visible" />
<text
x="4290.3008"
y="-2369.6162"
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
id="text423"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="13610.301 13911.301 14016.301 14088.301 14177.301 14355.301 14567.301 14728.301"
y="19573.385"
id="tspan425">WriteAck</tspan>
</text>
<text
id="text439"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
y="22559"
id="tspan441">got_BlockAck()</tspan>
</text>
<text
id="text455"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="7999 8304 8541 8753 8964 9201 9413 9531 9769 9862 10099 10310 10522 10734 10852 10971 11208 11348 11585 11822"
y="16877"
id="tspan457">Resync blocks, 4-32K</tspan>
</text>
<path
d="M 12000,7601 L 11900,7301 L 12100,7301 L 12000,7601 z"
id="path467"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,6801 L 12000,7361"
id="path471"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 12000,6801 L 11686,6840 L 11725,6644 L 12000,6801 z"
id="path483"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,6001 L 11765,6754"
id="path487"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="-1288.1796"
y="1279.7666"
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
id="text505"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="8174.8208 8475.8203 8580.8203 8652.8203 8741.8203 8919.8203 9131.8203 9292.8203"
y="9516.7666"
id="tspan507">WriteAck</tspan>
</text>
<path
d="M 8000,8601 L 8282,8458 L 8312,8655 L 8000,8601 z"
id="path517"
style="fill:#000080;visibility:visible" />
<path
d="M 12000,8001 L 8237,8565"
id="path521"
style="fill:none;stroke:#000080;visibility:visible" />
<text
x="1065.6655"
y="-2097.7664"
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
id="text539"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="10682.666 10911.666 11088.666 11177.666"
y="4107.2339"
id="tspan541">Data</tspan>
</text>
<text
id="text555"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
y="5505"
id="tspan557">drbd_make_request()</tspan>
</text>
<text
id="text571"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14190"
y="6806"
id="tspan573">receive_Data()</tspan>
</text>
<text
id="text587"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14207 14312 14384 14473 14651 14829 14990 15168 15328 15434"
y="7606"
id="tspan589">drbd_endio_write_sec()</tspan>
</text>
<text
id="text603"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12192 12370 12548 12725 12903 13081 13259 13437 13509 13686 13847 14008 14114"
y="8007"
id="tspan605">e_end_block()</tspan>
</text>
<text
id="text619"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5647 5825 6003 6092 6269 6481 6553 6731 6892 7052 7264 7425 7586 7692"
y="8606"
id="tspan621">got_BlockAck()</tspan>
</text>
<text
id="text635"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="8000 8305 8542 8779 9016 9109 9346 9486 9604 9956 10049 10189 10328 10565 10705 10942 11179 11298 11603 11742 11835 11954 12191 12310 12428 12665 12902 13139 13279 13516 13753"
y="4877"
id="tspan637">Regular mirrored write, 512-32K</tspan>
</text>
<text
id="text651"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5381 5610 5787 5948 6126 6304 6482 6659 6837 7015 7087 7265 7426 7587 7692"
y="6003"
id="tspan653">w_send_dblock()</tspan>
</text>
<path
d="M 8000,6800 L 7900,6500 L 8100,6500 L 8000,6800 z"
id="path663"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,6000 L 8000,6560"
id="path667"
style="fill:none;stroke:#008000;visibility:visible" />
<text
id="text683"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4602 4780 4886 5063 5241 5419 5597 5775 5952 6024 6202 6380 6609 6714 6786 6875 7053 7231 7409 7515 7587 7692"
y="6905"
id="tspan685">drbd_endio_write_pri()</tspan>
</text>
<path
d="M 12000,13602 L 11900,13302 L 12100,13302 L 12000,13602 z"
id="path695"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,12802 L 12000,13362"
id="path699"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 12000,12802 L 11686,12841 L 11725,12645 L 12000,12802 z"
id="path711"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,12002 L 11765,12755"
id="path715"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="-2155.5266"
y="1201.5964"
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
id="text733"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="7202.4736 7431.4736 7608.4736 7697.4736 7875.4736 8104.4736 8282.4736 8459.4736 8531.4736"
y="15454.597"
id="tspan735">DataReply</tspan>
</text>
<path
d="M 8000,14602 L 8282,14459 L 8312,14656 L 8000,14602 z"
id="path745"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,14002 L 8237,14566"
id="path749"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="2280.3804"
y="-2103.2141"
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
id="text767"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="11316.381 11545.381 11722.381 11811.381 11989.381 12218.381 12396.381 12573.381 12751.381 12929.381 13090.381"
y="9981.7861"
id="tspan769">DataRequest</tspan>
</text>
<text
id="text783"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
y="11506"
id="tspan785">drbd_make_request()</tspan>
</text>
<text
id="text799"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14312 14490 14668 14846 15024 15185 15273 15379"
y="12807"
id="tspan801">receive_DataRequest()</tspan>
</text>
<text
id="text815"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
y="13607"
id="tspan817">drbd_endio_read_sec()</tspan>
</text>
<text
id="text831"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14021 14110 14288 14465 14571 14749 14927 15033"
y="14008"
id="tspan833">w_e_end_data_req()</tspan>
</text>
<g
id="g835"
style="visibility:visible">
<desc
id="desc837">Drawing</desc>
<text
id="text847"
style="font-size:318px;font-weight:400;fill:#008000;font-family:Helvetica embedded">
<tspan
x="4885 4991 5169 5330 5507 5579 5740 5918 6096 6324 6502 6591 6769 6997 7175 7353 7425 7586 7692"
y="14607"
id="tspan849">receive_DataReply()</tspan>
</text>
</g>
<text
id="text863"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="8000 8305 8398 8610 8821 8914 9151 9363 9575 9693 9833 10070 10307 10544 10663 10781 11018 11255 11493 11632 11869 12106"
y="10878"
id="tspan865">Diskless read, 512-32K</tspan>
</text>
<text
id="text879"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5029 5258 5435 5596 5774 5952 6130 6307 6413 6591 6769 6947 7125 7230 7408 7586 7692"
y="12004"
id="tspan881">w_send_read_req()</tspan>
</text>
<text
id="text895"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="6961 7266 7571 7854 8159 8278 8515 8633 8870 9107 9226 9463 9581 9700 9793 10030"
y="2806"
id="tspan897">DRBD 8 data flow</tspan>
</text>
<path
d="M 3900,5300 L 3700,5300 L 3700,7000 L 3900,7000"
id="path907"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 3900,17600 L 3700,17600 L 3700,22000 L 3900,22000"
id="path919"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 16100,20000 L 16300,20000 L 16300,18500 L 16100,18500"
id="path931"
style="fill:none;stroke:#000000;visibility:visible" />
<text
id="text947"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2126 2304 2376 2554 2731 2909 3087 3159 3337 3515 3587 3764 3870"
y="5202"
id="tspan949">al_begin_io()</tspan>
</text>
<text
id="text963"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1632 1810 1882 2060 2220 2398 2661 2839 2910 3088 3177 3355 3533 3605 3783 3888"
y="7331"
id="tspan965">al_complete_io()</tspan>
</text>
<text
id="text979"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2126 2232 2393 2571 2748 2926 3104 3176 3354 3531 3603 3781 3887"
y="17431"
id="tspan981">rs_begin_io()</tspan>
</text>
<text
id="text995"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1626 1732 1893 2071 2231 2409 2672 2849 2921 3099 3188 3366 3544 3616 3793 3899"
y="22331"
id="tspan997">rs_complete_io()</tspan>
</text>
<text
id="text1011"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16027 16133 16294 16472 16649 16827 17005 17077 17255 17432 17504 17682 17788"
y="18402"
id="tspan1013">rs_begin_io()</tspan>
</text>
<text
id="text1027"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
y="20331"
id="tspan1029">rs_complete_io()</tspan>
</text>
</svg>

After

Width:  |  Height:  |  Size: 17 KiB

View File

@ -0,0 +1,16 @@
Description
DRBD is a shared-nothing, synchronously replicated block device. It
is designed to serve as a building block for high availability
clusters and in this context, is a "drop-in" replacement for shared
storage. Simplistically, you could see it as a network RAID 1.
Please visit http://www.drbd.org to find out more.
The files included here are intended to help understand the implementation
DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
relates some functions, and write packets.
conn-states-8.dot, disk-states-8.dot, node-states-8.dot
The sub graphs of DRBD's state transitions

View File

@ -0,0 +1,18 @@
// Connection state transitions of DRBD. Edge labels name the function or
// event associated with the transition.
digraph conn_states {
	StandAlone -> WFConnection [ label = "ioctl_set_net()" ]
	WFConnection -> Unconnected [ label = "unable to bind()" ]
	WFConnection -> WFReportParams [ label = "in connect() after accept" ]
	WFReportParams -> StandAlone [ label = "checks in receive_param()" ]
	WFReportParams -> Connected [ label = "in receive_param()" ]
	WFReportParams -> WFBitMapS [ label = "sync_handshake()" ]
	WFReportParams -> WFBitMapT [ label = "sync_handshake()" ]
	WFBitMapS -> SyncSource [ label = "receive_bitmap()" ]
	WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ]
	SyncSource -> Connected
	SyncTarget -> Connected
	SyncSource -> PausedSyncS
	SyncTarget -> PausedSyncT
	PausedSyncS -> SyncSource
	PausedSyncT -> SyncTarget
	Connected -> WFConnection [ label = "* on network error" ]
}

View File

@ -0,0 +1,16 @@
// Disk state transitions of DRBD. Each edge label names the function or
// event shown as triggering the transition; every state with an attached
// disk has an edge to Failed on an io completion error, and Failed leads
// back to Diskless.
digraph disk_states {
Diskless -> Inconsistent [ label = "ioctl_set_disk()" ]
Diskless -> Consistent [ label = "ioctl_set_disk()" ]
Diskless -> Outdated [ label = "ioctl_set_disk()" ]
Consistent -> Outdated [ label = "receive_param()" ]
Consistent -> UpToDate [ label = "receive_param()" ]
Consistent -> Inconsistent [ label = "start resync" ]
Outdated -> Inconsistent [ label = "start resync" ]
UpToDate -> Inconsistent [ label = "ioctl_replicate" ]
Inconsistent -> UpToDate [ label = "resync completed" ]
Consistent -> Failed [ label = "io completion error" ]
Outdated -> Failed [ label = "io completion error" ]
UpToDate -> Failed [ label = "io completion error" ]
Inconsistent -> Failed [ label = "io completion error" ]
Failed -> Diskless [ label = "sending notify to peer" ]
}

View File

@ -0,0 +1,85 @@
// vim: set sw=2 sts=2 :
// Overview of the DRBD connection states, drawn bottom-to-top (rankdir=BT).
// Node font colours group the states: handshake states in gray, bitmap
// exchange in red, resynchronisation in blue. Edge colours: green edges
// lead to Connected, red edges lead to CommTrouble.
digraph {
  rankdir=BT
  bgcolor=white

  node [shape=plaintext]
  node [fontcolor=black]

  StandAlone [ style=filled,fillcolor=gray,label=StandAlone ]

  node [fontcolor=lightgray]

  Unconnected [ label=Unconnected ]

  CommTrouble [ shape=record,
    label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]

  node [fontcolor=gray]

  subgraph cluster_try_connect {
    label="try to connect, handshake"
    rank=max
    WFConnection [ label=WFConnection ]
    WFReportParams [ label=WFReportParams ]
  }

  TearDown [ label=TearDown ]

  Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]

  node [fontcolor=lightblue]

  StartingSyncS [ label=StartingSyncS ]
  StartingSyncT [ label=StartingSyncT ]

  subgraph cluster_bitmap_exchange {
    node [fontcolor=red]
    fontcolor=red
    label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"

    WFBitMapT [ label=WFBitMapT ]
    WFSyncUUID [ label=WFSyncUUID ]
    WFBitMapS [ label=WFBitMapS ]
  }

  node [fontcolor=blue]

  cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]

  node [shape=box,fontcolor=black]

  // drbdadm [label="drbdadm connect"]
  // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
  // comm_error [label="communication trouble"]

  //
  // edges
  // --------------------------------------

  StandAlone -> Unconnected [ label="drbdadm connect" ]
  Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ]
  Unconnected -> WFConnection [ label="receiver thread is started" ]
  WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ]

  WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
  WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]

  WFReportParams -> WFBitMapS
  WFReportParams -> WFBitMapT
  WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]

  WFBitMapS -> cluster_resync:S
  WFSyncUUID -> cluster_resync:T

  edge [color=green]
  // label typo fixed: was "resnyc done"
  cluster_resync:any -> Connected [ label="resync done",fontcolor=green ]

  edge [color=red]
  WFReportParams -> CommTrouble
  Connected -> CommTrouble
  cluster_resync:any -> CommTrouble
  edge [color=black]
  CommTrouble -> Unconnected [label="receiver thread is stopped" ]
}

View File

@ -0,0 +1,14 @@
// Role transitions of the local node; both directions are labelled with
// ioctl_set_state().
digraph node_states {
Secondary -> Primary [ label = "ioctl_set_state()" ]
Primary -> Secondary [ label = "ioctl_set_state()" ]
}
// Role of the peer as tracked locally: it changes when a state packet is
// received, becomes Unknown when the connection is lost, and leaves Unknown
// again once connected.
digraph peer_states {
Secondary -> Primary [ label = "recv state packet" ]
Primary -> Secondary [ label = "recv state packet" ]
Primary -> Unknown [ label = "connection lost" ]
Secondary -> Unknown [ label = "connection lost" ]
Unknown -> Primary [ label = "connected" ]
Unknown -> Secondary [ label = "connected" ]
}

View File

@ -0,0 +1,135 @@
Block IO Controller
===================
Overview
========
cgroup subsys "blkio" implements the block io controller. There seems to be
a need of various kinds of IO control policies (like proportional BW, max BW)
both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
Plan is to use the same cgroup based management interface for blkio controller
and based on user options switch IO policies in the background.
In the first phase, this patchset implements proportional weight time based
division of disk policy. It is implemented in CFQ. Hence this policy takes
effect only on leaf nodes when CFQ is being used.
HOWTO
=====
You can do a very simple test by running two dd threads in two different
cgroups. Here is what you can do.
- Enable group scheduling in CFQ
CONFIG_CFQ_GROUP_IOSCHED=y
- Compile and boot into kernel and mount IO controller (blkio).
mount -t cgroup -o blkio none /cgroup
- Create two cgroups
mkdir -p /cgroup/test1/ /cgroup/test2
- Set weights of group test1 and test2
echo 1000 > /cgroup/test1/blkio.weight
echo 500 > /cgroup/test2/blkio.weight
- Create two same size files (say 512MB each) on same disk (zerofile1,
zerofile2) and launch two dd threads in different cgroups to read those files.
sync
echo 3 > /proc/sys/vm/drop_caches
dd if=/mnt/sdb/zerofile1 of=/dev/null &
echo $! > /cgroup/test1/tasks
cat /cgroup/test1/tasks
dd if=/mnt/sdb/zerofile2 of=/dev/null &
echo $! > /cgroup/test2/tasks
cat /cgroup/test2/tasks
- At macro level, first dd should finish first. To get more precise data, keep
on looking (with the help of a script) at the blkio.time and
blkio.sectors files of both test1 and test2 groups. This will tell how
much disk time (in milliseconds) each group got and how many sectors each
group dispatched to the disk. We provide fairness in terms of disk time, so
ideally blkio.time of the cgroups should be in proportion to the weight.
Various user visible config options
===================================
CONFIG_CFQ_GROUP_IOSCHED
- Enables group scheduling in CFQ. Currently only 1 level of group
creation is allowed.
CONFIG_DEBUG_CFQ_IOSCHED
- Enables some debugging messages in blktrace. Also creates extra
cgroup file blkio.dequeue.
Config options selected automatically
=====================================
These config options are not user visible and are selected/deselected
automatically based on IO scheduler configuration.
CONFIG_BLK_CGROUP
- Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
CONFIG_DEBUG_BLK_CGROUP
- Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
Details of cgroup files
=======================
- blkio.weight
- Specifies per cgroup weight.
Currently allowed range of weights is from 100 to 1000.
- blkio.time
- disk time allocated to cgroup per device in milliseconds. First
two fields specify the major and minor number of the device and
third field specifies the disk time allocated to group in
milliseconds.
- blkio.sectors
- number of sectors transferred to/from disk by the group. First
two fields specify the major and minor number of the device and
third field specifies the number of sectors transferred by the
group to/from the device.
- blkio.dequeue
- Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
gives the statistics about how many times a group was dequeued
from the service tree of the device. First two fields specify the major
and minor number of the device and third field specifies the number
of times a group was dequeued from a particular device.
CFQ sysfs tunable
=================
/sys/block/<disk>/queue/iosched/group_isolation
If group_isolation=1, it provides stronger isolation between groups at the
expense of throughput. By default group_isolation is 0. In general that
means that if group_isolation=0, expect fairness for sequential workload
only. Set group_isolation=1 to see fairness for random IO workload also.
Generally CFQ will put random seeky workload in sync-noidle category. CFQ
will disable idling on these queues and it does a collective idling on group
of such queues. Generally these are slow moving queues and if there is a
sync-noidle service tree in each group, that group gets exclusive access to
disk for certain period. That means it will bring the throughput down if
group does not have enough IO to drive deeper queue depths and utilize disk
capacity to the fullest in the slice allocated to it. But the flip side is
that even a random reader should get better latencies and overall throughput
if there are lots of sequential readers/sync-idle workload running in the
system.
If group_isolation=0, then CFQ automatically moves all the random seeky queues
into the root group. That means there will be no service differentiation for
that kind of workload. This leads to better throughput as we do collective
idling on root sync-noidle tree.
By default one should run with group_isolation=0. If that is not sufficient
and one wants stronger isolation between groups, then set group_isolation=1,
but this will come at the cost of reduced throughput.
What works
==========
- Currently only sync IO queues are supported. All the buffered writes are
still system wide and not per group. Hence we will not see service
differentiation for buffered writes between groups.

View File

@ -1848,6 +1848,19 @@ S: Maintained
F: drivers/scsi/dpt*
F: drivers/scsi/dpt/
DRBD DRIVER
P: Philipp Reisner
P: Lars Ellenberg
M: drbd-dev@lists.linbit.com
L: drbd-user@lists.linbit.com
W: http://www.drbd.org
T: git git://git.drbd.org/linux-2.6-drbd.git drbd
T: git git://git.drbd.org/drbd-8.3.git
S: Supported
F: drivers/block/drbd/
F: lib/lru_cache.c
F: Documentation/blockdev/drbd/
DRIVER CORE, KOBJECTS, AND SYSFS
M: Greg Kroah-Hartman <gregkh@suse.de>
T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/

View File

@ -9,6 +9,7 @@
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_range(vma, start, end) do { } while (0)
#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -418,6 +418,7 @@ extern void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
* about to change to user space. This is the same method as used on SPARC64.
* See update_mmu_cache for the user space part.
*/
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *);
static inline void __flush_icache_all(void)

View File

@ -107,6 +107,7 @@ extern void flush_icache_page(struct vm_area_struct *vma, struct page *page);
* do something here, but only for certain configurations. No such
* configurations exist at this time.
*/
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(page) do { } while (0)
#define flush_dcache_mmap_unlock(page) do { } while (0)

View File

@ -68,9 +68,11 @@ do { memcpy(dst, src, len); \
#endif
#if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK)
# define flush_dcache_range(start,end) blackfin_dcache_flush_range((start), (end))
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
# define flush_dcache_page(page) blackfin_dflush_page(page_address(page))
#else
# define flush_dcache_range(start,end) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
# define flush_dcache_page(page) do { } while (0)
#endif

View File

@ -12,6 +12,7 @@
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_range(vma, start, end) do { } while (0)
#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -47,6 +47,7 @@ static inline void __flush_cache_all(void)
}
/* dcache/icache coherency... */
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
#ifdef CONFIG_MMU
extern void flush_dcache_page(struct page *page);
#else

View File

@ -15,6 +15,7 @@
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_range(vma,a,b)
#define flush_cache_page(vma,p,pfn)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page)
#define flush_dcache_mmap_lock(mapping)
#define flush_dcache_mmap_unlock(mapping)

View File

@ -25,6 +25,7 @@
#define flush_cache_vmap(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
#define flush_dcache_page(page) \
do { \
clear_bit(PG_arch_1, &(page)->flags); \

View File

@ -12,6 +12,7 @@ extern void _flush_cache_copyback_all(void);
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_range(vma, start, end) do { } while (0)
#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)
@ -33,6 +34,7 @@ extern void smp_flush_cache_all(void);
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_range(vma, start, end) do { } while (0)
#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)
@ -46,6 +48,7 @@ extern void smp_flush_cache_all(void);
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_range(vma, start, end) do { } while (0)
#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -128,6 +128,7 @@ static inline void __flush_page_to_ram(void *vaddr)
}
}
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
#define flush_dcache_page(page) __flush_page_to_ram(page_address(page))
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -12,6 +12,7 @@
#define flush_cache_range(vma, start, end) __flush_cache_all()
#define flush_cache_page(vma, vmaddr) do { } while (0)
#define flush_dcache_range(start,len) __flush_cache_all()
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -37,6 +37,7 @@
#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
#define flush_dcache_range(start, end) __invalidate_dcache_range(start, end)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -38,6 +38,7 @@ extern void (*flush_cache_range)(struct vm_area_struct *vma,
extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn);
extern void __flush_dcache_page(struct page *page);
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
static inline void flush_dcache_page(struct page *page)
{
if (cpu_has_dc_aliases || !cpu_has_ic_fills_f_dc)

View File

@ -26,6 +26,7 @@
#define flush_cache_page(vma, vmaddr, pfn) do {} while (0)
#define flush_cache_vmap(start, end) do {} while (0)
#define flush_cache_vunmap(start, end) do {} while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do {} while (0)
#define flush_dcache_mmap_lock(mapping) do {} while (0)
#define flush_dcache_mmap_unlock(mapping) do {} while (0)

View File

@ -42,6 +42,7 @@ void flush_cache_mm(struct mm_struct *mm);
#define flush_cache_vmap(start, end) flush_cache_all()
#define flush_cache_vunmap(start, end) flush_cache_all()
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *page);
#define flush_dcache_mmap_lock(mapping) \

View File

@ -25,6 +25,7 @@
#define flush_cache_vmap(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *page);
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -10,6 +10,7 @@
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_range(vma, start, end) do { } while (0)
#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -16,6 +16,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end);
extern void flush_dcache_range(unsigned long start, unsigned long end);
#define flush_cache_dup_mm(mm) do {} while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do {} while (0)
#define flush_dcache_mmap_lock(mapping) do {} while (0)
#define flush_dcache_mmap_unlock(mapping) do {} while (0)

View File

@ -42,6 +42,7 @@ extern void flush_cache_page(struct vm_area_struct *vma,
unsigned long addr, unsigned long pfn);
extern void flush_cache_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *page);
extern void flush_icache_range(unsigned long start, unsigned long end);
extern void flush_icache_page(struct vm_area_struct *vma,

View File

@ -75,6 +75,7 @@ BTFIXUPDEF_CALL(void, flush_sig_insns, struct mm_struct *, unsigned long)
extern void sparc_flush_page_to_ram(struct page *page);
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
#define flush_dcache_page(page) sparc_flush_page_to_ram(page)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -37,6 +37,7 @@ extern void flush_dcache_page_all(struct mm_struct *mm, struct page *page);
#endif
extern void __flush_dcache_range(unsigned long start, unsigned long end);
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *page);
#define flush_icache_page(vma, pg) do { } while(0)

View File

@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end) { }
static inline void flush_cache_page(struct vm_area_struct *vma,
unsigned long vmaddr, unsigned long pfn) { }
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
static inline void flush_dcache_page(struct page *page) { }
static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }

View File

@ -101,6 +101,7 @@ static inline void __invalidate_icache_page_alias(unsigned long virt,
#define flush_cache_vmap(start,end) flush_cache_all()
#define flush_cache_vunmap(start,end) flush_cache_all()
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page*);
extern void flush_cache_range(struct vm_area_struct*, ulong, ulong);
extern void flush_cache_page(struct vm_area_struct*, unsigned long, unsigned long);

View File

@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
config BLK_CGROUP
bool
depends on CGROUPS
default n
---help---
Generic block IO controller cgroup interface. This is the common
cgroup interface which should be used by various IO controlling
policies.
Currently, CFQ IO scheduler uses it to recognize task groups and
control disk bandwidth allocation (proportional time slice allocation)
to such task groups.
config DEBUG_BLK_CGROUP
bool
depends on BLK_CGROUP
default n
---help---
Enable some debugging help. Currently it stores the cgroup path
in the blk group which can be used by cfq for tracing various
group related activity.
endif # BLOCK
config BLOCK_COMPAT

View File

@ -12,24 +12,14 @@ config IOSCHED_NOOP
that do their own scheduling and require only minimal assistance from
the kernel.
config IOSCHED_AS
tristate "Anticipatory I/O scheduler"
default y
---help---
The anticipatory I/O scheduler is generally a good choice for most
environments, but is quite large and complex when compared to the
deadline I/O scheduler, it can also be slower in some cases
especially some database loads.
config IOSCHED_DEADLINE
tristate "Deadline I/O scheduler"
default y
---help---
The deadline I/O scheduler is simple and compact, and is often as
good as the anticipatory I/O scheduler, and in some database
workloads, better. In the case of a single process performing I/O to
a disk at any one time, its behaviour is almost identical to the
anticipatory I/O scheduler and so is a good choice.
The deadline I/O scheduler is simple and compact. It will provide
CSCAN service with FIFO expiration of requests, switching to
a new point in the service tree and doing a batch of IO from there
in case of expiry.
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
@ -37,9 +27,28 @@ config IOSCHED_CFQ
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
among all processes in the system. It should provide a fair
working environment, suitable for desktop systems.
and low latency working environment, suitable for both desktop
and server systems.
This is the default I/O scheduler.
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
depends on IOSCHED_CFQ && CGROUPS
select BLK_CGROUP
default n
---help---
Enable group IO scheduling in CFQ.
config DEBUG_CFQ_IOSCHED
bool "Debug CFQ Scheduling"
depends on CFQ_GROUP_IOSCHED
select DEBUG_BLK_CGROUP
default n
---help---
Enable CFQ IO scheduling debugging in CFQ. Currently it makes
blktrace output more verbose.
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
@ -47,9 +56,6 @@ choice
Select the I/O scheduler which will be used by default for all
block devices.
config DEFAULT_AS
bool "Anticipatory" if IOSCHED_AS=y
config DEFAULT_DEADLINE
bool "Deadline" if IOSCHED_DEADLINE=y
@ -63,7 +69,6 @@ endchoice
config DEFAULT_IOSCHED
string
default "anticipatory" if DEFAULT_AS
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
default "noop" if DEFAULT_NOOP

View File

@ -8,8 +8,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o

File diff suppressed because it is too large Load Diff

361
block/blk-cgroup.c Normal file
View File

@ -0,0 +1,361 @@
/*
* Common Block IO controller cgroup interface
*
* Based on ideas and code from CFQ, CFS and BFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
*
* Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
* Nauman Rafique <nauman@google.com>
*/
#include <linux/ioprio.h>
#include <linux/seq_file.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
#include "blk-cgroup.h"
/*
 * blkio_list links struct blkio_policy_type entries through their 'list'
 * member; blkio_list_lock is held while that list is walked (see
 * blkiocg_weight_write()).
 */
static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);
/*
 * Presumably the blkio state of the root cgroup (going by the name); its
 * weight is initialised to twice BLKIO_WEIGHT_DEFAULT.
 */
struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);
/*
 * Try to take a reference on the css embedded in @blkcg.
 *
 * Returns true if css_tryget() succeeded, false otherwise.
 */
bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
{
	return css_tryget(&blkcg->css) ? true : false;
}
EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
/* Hand the css embedded in @blkcg to css_put(). */
void blkiocg_css_put(struct blkio_cgroup *blkcg)
{
	struct cgroup_subsys_state *css = &blkcg->css;

	css_put(css);
}
EXPORT_SYMBOL_GPL(blkiocg_css_put);
/*
 * Map @cgroup to its blkio_cgroup: look up the subsystem state registered
 * under blkio_subsys_id and step back to the structure that embeds it.
 */
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
	struct cgroup_subsys_state *css;

	css = cgroup_subsys_state(cgroup, blkio_subsys_id);
	return container_of(css, struct blkio_cgroup, css);
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
/*
 * Accumulate @time and @sectors into the corresponding counters of @blkg.
 * No locking is done here.
 */
void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
			unsigned long time, unsigned long sectors)
{
	blkg->time = blkg->time + time;
	blkg->sectors = blkg->sectors + sectors;
}
EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
/*
 * Link @blkg into the group list of @blkcg.
 *
 * With blkcg->lock held (interrupts saved/disabled) the key is published
 * with rcu_assign_pointer(), the css id of @blkcg is recorded in
 * blkg->blkcg_id and the group is added to blkcg->blkg_list with
 * hlist_add_head_rcu().
 *
 * After the lock is dropped, CONFIG_DEBUG_BLK_CGROUP builds store the cgroup
 * path in blkg->path, and finally @dev is stored in blkg->dev.
 */
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev)
{
unsigned long flags;
spin_lock_irqsave(&blkcg->lock, flags);
rcu_assign_pointer(blkg->key, key);
blkg->blkcg_id = css_id(&blkcg->css);
hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
spin_unlock_irqrestore(&blkcg->lock, flags);
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Need to take css reference ? */
cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
#endif
/*
 * dev is set only after the group is already on the list; the per-group
 * show functions in this file skip groups whose dev is still 0.
 */
blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
/*
 * Unhash @blkg from its cgroup's list (RCU-aware removal) and clear its
 * recorded css id. The only caller visible in this file,
 * blkiocg_del_blkio_group(), calls this with blkcg->lock held.
 */
static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
{
hlist_del_init_rcu(&blkg->blkcg_node);
blkg->blkcg_id = 0;
}
/*
 * Remove @blkg from the list of the blkio cgroup identified by
 * blkg->blkcg_id.
 *
 * Returns 0 if the blkio_group was still on the cgroup list and was removed
 * by this call. Returns 1 if it had already been unhashed by the time we got
 * to it, or if no css could be looked up for the recorded id.
 */
int blkiocg_del_blkio_group(struct blkio_group *blkg)
{
	struct cgroup_subsys_state *css;
	int already_gone = 1;

	rcu_read_lock();
	css = css_lookup(&blkio_subsys, blkg->blkcg_id);
	if (css) {
		struct blkio_cgroup *owner;
		unsigned long irq_flags;

		owner = container_of(css, struct blkio_cgroup, css);
		spin_lock_irqsave(&owner->lock, irq_flags);
		if (!hlist_unhashed(&blkg->blkcg_node)) {
			__blkiocg_del_blkio_group(blkg);
			already_gone = 0;
		}
		spin_unlock_irqrestore(&owner->lock, irq_flags);
	}
	rcu_read_unlock();

	return already_gone;
}
EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
/*
 * Find the blkio_group on @blkcg's list whose key equals @key.
 *
 * Must be called under rcu_read_lock(). Returns the first matching group,
 * or NULL if none matches.
 */
struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
{
	struct blkio_group *grp;
	struct hlist_node *pos;

	hlist_for_each_entry_rcu(grp, pos, &blkcg->blkg_list, blkcg_node) {
		if (grp->key != key)
			continue;
		return grp;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
#define SHOW_FUNCTION(__VAR) \
static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
struct cftype *cftype) \
{ \
struct blkio_cgroup *blkcg; \
\
blkcg = cgroup_to_blkio_cgroup(cgroup); \
return (u64)blkcg->__VAR; \
}
SHOW_FUNCTION(weight);
#undef SHOW_FUNCTION
/*
 * Write handler for the "weight" cgroup file.
 *
 * Rejects values outside [BLKIO_WEIGHT_MIN, BLKIO_WEIGHT_MAX] with -EINVAL.
 * Otherwise stores the new weight in the blkio_cgroup and, for every
 * blkio_group on its list, calls blkio_update_group_weight_fn() of each
 * blkio_policy_type on blkio_list. Returns 0 on success.
 *
 * Locking: blkcg->lock is held (with interrupts disabled) for the whole
 * update; blkio_list_lock is taken inside it, once per group.
 */
static int
blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
{
struct blkio_cgroup *blkcg;
struct blkio_group *blkg;
struct hlist_node *n;
struct blkio_policy_type *blkiop;
if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
return -EINVAL;
blkcg = cgroup_to_blkio_cgroup(cgroup);
spin_lock_irq(&blkcg->lock);
blkcg->weight = (unsigned int)val;
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
spin_lock(&blkio_list_lock);
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_update_group_weight_fn(blkg,
blkcg->weight);
spin_unlock(&blkio_list_lock);
}
spin_unlock_irq(&blkcg->lock);
return 0;
}
/*
 * SHOW_FUNCTION_PER_GROUP(__VAR) - generate blkiocg_<__VAR>_read(), a
 * seq_file read handler that prints one "major:minor value" line for every
 * blkio_group on the cgroup's blkg_list whose dev field is non-zero, where
 * value is the group's <__VAR> counter.
 *
 * The generated handler returns -ENODEV if cgroup_lock_live_group() fails.
 * Otherwise it walks the list under rcu_read_lock(), calls cgroup_unlock()
 * and returns 0.
 */
#define SHOW_FUNCTION_PER_GROUP(__VAR) \
static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
struct cftype *cftype, struct seq_file *m) \
{ \
struct blkio_cgroup *blkcg; \
struct blkio_group *blkg; \
struct hlist_node *n; \
\
if (!cgroup_lock_live_group(cgroup)) \
return -ENODEV; \
\
blkcg = cgroup_to_blkio_cgroup(cgroup); \
rcu_read_lock(); \
hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
if (blkg->dev) \
seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
MINOR(blkg->dev), blkg->__VAR); \
} \
rcu_read_unlock(); \
cgroup_unlock(); \
return 0; \
}
/* Handlers backing the "time" and "sectors" control files. */
SHOW_FUNCTION_PER_GROUP(time);
SHOW_FUNCTION_PER_GROUP(sectors);
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* The dequeue counter only exists in debug builds. */
SHOW_FUNCTION_PER_GROUP(dequeue);
#endif
#undef SHOW_FUNCTION_PER_GROUP
#ifdef CONFIG_DEBUG_BLK_CGROUP
/*
 * Add @dequeue to @blkg's running dequeue counter (debug builds only).
 * No locking is done here.
 */
void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
unsigned long dequeue)
{
blkg->dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
#endif
/*
 * Control files of the blkio cgroup subsystem; registered for each cgroup
 * by blkiocg_populate() through cgroup_add_files().
 */
struct cftype blkio_files[] = {
{
/* read/write cgroup weight, validated in blkiocg_weight_write() */
.name = "weight",
.read_u64 = blkiocg_weight_read,
.write_u64 = blkiocg_weight_write,
},
{
/* per-device "major:minor value" listing of blkio_group.time */
.name = "time",
.read_seq_string = blkiocg_time_read,
},
{
/* per-device "major:minor value" listing of blkio_group.sectors */
.name = "sectors",
.read_seq_string = blkiocg_sectors_read,
},
#ifdef CONFIG_DEBUG_BLK_CGROUP
{
/* per-device dequeue counter, debug builds only */
.name = "dequeue",
.read_seq_string = blkiocg_dequeue_read,
},
#endif
};
/*
 * cgroup ->populate callback: add every entry of blkio_files[] to @cgroup.
 * Returns whatever cgroup_add_files() returns.
 */
static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
return cgroup_add_files(cgroup, subsys, blkio_files,
ARRAY_SIZE(blkio_files));
}
static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
unsigned long flags;
struct blkio_group *blkg;
void *key;
struct blkio_policy_type *blkiop;
rcu_read_lock();
remove_entry:
spin_lock_irqsave(&blkcg->lock, flags);
if (hlist_empty(&blkcg->blkg_list)) {
spin_unlock_irqrestore(&blkcg->lock, flags);
goto done;
}
blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
blkcg_node);
key = rcu_dereference(blkg->key);
__blkiocg_del_blkio_group(blkg);
spin_unlock_irqrestore(&blkcg->lock, flags);
/*
* This blkio_group is being unlinked as associated cgroup is going
* away. Let all the IO controlling policies know about this event.
*
* Currently this is static call to one io controlling policy. Once
* we have more policies in place, we need some dynamic registration
* of callback function.
*/
spin_lock(&blkio_list_lock);
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_unlink_group_fn(key, blkg);
spin_unlock(&blkio_list_lock);
goto remove_entry;
done:
free_css_id(&blkio_subsys, &blkcg->css);
rcu_read_unlock();
kfree(blkcg);
}
/*
 * cgroup ->create callback: provide the blkio state for a new @cgroup.
 *
 * The root cgroup (no parent) is backed by the static blkio_root_cgroup.
 * Any other cgroup gets a zeroed allocation with the default weight, but
 * only if its parent is at depth 0: hierarchies deeper than two levels
 * (0,1) are currently rejected with -EINVAL.  Returns ERR_PTR(-ENOMEM) if
 * the allocation fails, otherwise the embedded css.
 */
static struct cgroup_subsys_state *
blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg;

	if (cgroup->parent) {
		struct blkio_cgroup *parent_blkcg;

		parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
		if (css_depth(&parent_blkcg->css) > 0)
			return ERR_PTR(-EINVAL);

		blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
		if (!blkcg)
			return ERR_PTR(-ENOMEM);

		blkcg->weight = BLKIO_WEIGHT_DEFAULT;
	} else {
		blkcg = &blkio_root_cgroup;
	}

	/* Common initialisation for both the root and allocated groups. */
	spin_lock_init(&blkcg->lock);
	INIT_HLIST_HEAD(&blkcg->blkg_list);

	return &blkcg->css;
}
/*
 * Shared io contexts cannot be supported: there is no means to keep two
 * tasks that share one ioc in two different groups without major rework
 * of the main cic data structures.  For now a task may change its cgroup
 * only if it is the sole owner of its ioc.
 *
 * Returns 0 if @tsk may be attached, -EINVAL if its io_context is shared
 * with other tasks.
 */
static int blkiocg_can_attach(struct cgroup_subsys *subsys,
				struct cgroup *cgroup, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *ioc;
	int shared;

	/* task_lock() is needed to avoid races with exit_io_context() */
	task_lock(tsk);
	ioc = tsk->io_context;
	shared = ioc && atomic_read(&ioc->nr_tasks) > 1;
	task_unlock(tsk);

	return shared ? -EINVAL : 0;
}
/*
 * cgroup ->attach callback: flag @tsk's io_context, if it has one, as
 * having changed cgroup.  Only the cgroup_changed flag is set here; the
 * flag is consumed elsewhere.
 */
static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
				struct cgroup *prev, struct task_struct *tsk,
				bool threadgroup)
{
	struct io_context *task_ioc;

	task_lock(tsk);
	task_ioc = tsk->io_context;
	if (task_ioc != NULL)
		task_ioc->cgroup_changed = 1;
	task_unlock(tsk);
}
/*
 * cgroup subsystem descriptor for "blkio", wiring up the callbacks defined
 * in this file.
 */
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
.create = blkiocg_create,
.can_attach = blkiocg_can_attach,
.attach = blkiocg_attach,
.destroy = blkiocg_destroy,
.populate = blkiocg_populate,
.subsys_id = blkio_subsys_id,
/* this file uses css ids: css_id() when adding a group, css_lookup() when deleting one */
.use_id = 1,
};
/*
 * Append @blkiop to the global blkio_list under blkio_list_lock.
 *
 * blkiocg_weight_write() and blkiocg_destroy() call
 * ops.blkio_update_group_weight_fn and ops.blkio_unlink_group_fn on every
 * listed policy without a NULL check, so both callbacks must be set.
 */
void blkio_policy_register(struct blkio_policy_type *blkiop)
{
spin_lock(&blkio_list_lock);
list_add_tail(&blkiop->list, &blkio_list);
spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_register);
/*
 * Remove @blkiop from the global blkio_list under blkio_list_lock and
 * reinitialise its list head.
 */
void blkio_policy_unregister(struct blkio_policy_type *blkiop)
{
spin_lock(&blkio_list_lock);
list_del_init(&blkiop->list);
spin_unlock(&blkio_list_lock);
}
EXPORT_SYMBOL_GPL(blkio_policy_unregister);

127
block/blk-cgroup.h Normal file
View File

@ -0,0 +1,127 @@
#ifndef _BLK_CGROUP_H
#define _BLK_CGROUP_H
/*
* Common Block IO controller cgroup interface
*
* Based on ideas and code from CFQ, CFS and BFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
*
* Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
* Nauman Rafique <nauman@google.com>
*/
#include <linux/cgroup.h>
#ifdef CONFIG_BLK_CGROUP
/* Per-cgroup state of the blkio controller. */
struct blkio_cgroup {
/* embedded cgroup subsystem state; container_of() maps it back to this struct */
struct cgroup_subsys_state css;
/* IO weight of the cgroup, expected within BLKIO_WEIGHT_MIN..BLKIO_WEIGHT_MAX */
unsigned int weight;
/* taken around additions to and removals from blkg_list */
spinlock_t lock;
/* blkio_groups linked to this cgroup through blkio_group.blkcg_node */
struct hlist_head blkg_list;
};
/*
 * One group of a blkio cgroup, identified within the cgroup by @key and
 * carrying the per-device statistics reported through the control files.
 */
struct blkio_group {
/* An rcu protected unique identifier for the group */
void *key;
/* link on the owning blkio_cgroup's blkg_list */
struct hlist_node blkcg_node;
/* css id of the owning blkio cgroup */
unsigned short blkcg_id;
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Store cgroup path */
char path[128];
/* How many times this group has been removed from service tree */
unsigned long dequeue;
#endif
/* The device MKDEV(major, minor), this group has been created for */
dev_t dev;
/* total disk time and nr sectors dispatched by this group */
unsigned long time;
unsigned long sectors;
};
extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
/* Callback type: @blkg, identified by @key, has been unlinked from its cgroup. */
typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
/* Callback type: the weight of the cgroup owning @blkg is now @weight. */
typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
unsigned int weight);
/* Callbacks an IO controlling policy supplies to the blkio cgroup code. */
struct blkio_policy_ops {
blkio_unlink_group_fn *blkio_unlink_group_fn;
blkio_update_group_weight_fn *blkio_update_group_weight_fn;
};
/* An IO controlling policy as passed to blkio_policy_register(). */
struct blkio_policy_type {
/* link used to chain registered policies */
struct list_head list;
struct blkio_policy_ops ops;
};
/* Blkio controller policy registration */
extern void blkio_policy_register(struct blkio_policy_type *);
extern void blkio_policy_unregister(struct blkio_policy_type *);
#else
/* !CONFIG_BLK_CGROUP: empty placeholder types and no-op registration stubs. */
struct blkio_group {
};
struct blkio_policy_type {
};
static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
#endif
#define BLKIO_WEIGHT_MIN 100
#define BLKIO_WEIGHT_MAX 1000
#define BLKIO_WEIGHT_DEFAULT 500
#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Return the cgroup path string stored in @blkg. */
static inline char *blkg_path(struct blkio_group *blkg)
{
return blkg->path;
}
/* Add @dequeue to @blkg's dequeue counter. */
void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
unsigned long dequeue);
#else
/* Without debug support no path is recorded and dequeue stats are dropped. */
static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
static inline void blkiocg_update_blkio_group_dequeue_stats(
struct blkio_group *blkg, unsigned long dequeue) {}
#endif
#ifdef CONFIG_BLK_CGROUP
/* Statically allocated state of the root blkio cgroup. */
extern struct blkio_cgroup blkio_root_cgroup;
/* Map a cgroup to its blkio_cgroup state. */
extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
/* Link @blkg, identified by @key and created for device @dev, to @blkcg. */
extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev);
/* Unlink @blkg; returns 0 if it was still linked, 1 if it was already gone. */
extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
/* Find the group with @key on @blkcg; call under rcu_read_lock(). */
extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
void *key);
void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
unsigned long time, unsigned long sectors);
#else
/* !CONFIG_BLK_CGROUP: no-op stubs so callers need no #ifdefs. */
struct cgroup;
static inline struct blkio_cgroup *
cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev)
{
}
static inline int
blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
static inline struct blkio_group *
blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
unsigned long time, unsigned long sectors)
{
}
#endif
#endif /* _BLK_CGROUP_H */

View File

@ -2358,6 +2358,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
rq->rq_disk = bio->bi_bdev->bd_disk;
}
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
* rq_flush_dcache_pages - Helper function to flush all pages in a request
* @rq: the request to be flushed
*
* Description:
* Flush all pages in @rq.
*/
void rq_flush_dcache_pages(struct request *rq)
{
struct req_iterator iter;
struct bio_vec *bvec;
rq_for_each_segment(bvec, rq, iter)
flush_dcache_page(bvec->bv_page);
}
EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
#endif
/**
* blk_lld_busy - Check if underlying low-level drivers of a device are busy
* @q : the queue of the device being checked

View File

@ -66,22 +66,22 @@ static void cfq_exit(struct io_context *ioc)
}
/* Called by the exiting task */
void exit_io_context(void)
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(current);
ioc = current->io_context;
current->io_context = NULL;
task_unlock(current);
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
if (atomic_dec_and_test(&ioc->nr_tasks)) {
if (ioc->aic && ioc->aic->exit)
ioc->aic->exit(ioc->aic);
cfq_exit(ioc);
put_io_context(ioc);
}
put_io_context(ioc);
}
struct io_context *alloc_io_context(gfp_t gfp_flags, int node)

View File

@ -8,6 +8,7 @@
#include <linux/blkdev.h>
#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
#include <linux/gcd.h>
#include <linux/jiffies.h>
#include "blk.h"
@ -96,7 +97,11 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_segment_size = MAX_SEGMENT_SIZE;
lim->max_sectors = BLK_DEF_MAX_SECTORS;
lim->max_hw_sectors = INT_MAX;
lim->max_discard_sectors = SAFE_MAX_SECTORS;
lim->max_discard_sectors = 0;
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->discard_zeroes_data = -1;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
@ -141,7 +146,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
q->nr_batching = BLK_BATCH_REQ;
q->unplug_thresh = 4; /* hmm */
q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
if (q->unplug_delay == 0)
q->unplug_delay = 1;
@ -488,6 +493,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
}
EXPORT_SYMBOL(blk_queue_stack_limits);
/*
 * Least common multiple of two limits, where 0 means "not set".
 *
 * If both values are non-zero the lcm is returned; if only one is set that
 * value is returned; if neither is set the result is 0.
 *
 * The division is done before the multiplication: a / gcd(a, b) is exact,
 * so the result is mathematically the same as (a * b) / gcd(a, b), but the
 * intermediate product can no longer overflow when the lcm itself fits.
 */
static unsigned int lcm(unsigned int a, unsigned int b)
{
	if (a && b)
		return (a / gcd(a, b)) * b;
	else if (b)
		return b;

	return a;
}
/**
* blk_stack_limits - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top)
@ -502,6 +517,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t offset)
{
int ret;
ret = 0;
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
@ -526,12 +545,19 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->io_min = max(t->io_min, b->io_min);
t->no_cluster |= b->no_cluster;
t->discard_zeroes_data &= b->discard_zeroes_data;
/* Bottom device offset aligned? */
if (offset &&
(offset & (b->physical_block_size - 1)) != b->alignment_offset) {
t->misaligned = 1;
return -1;
ret = -1;
}
if (offset &&
(offset & (b->discard_granularity - 1)) != b->discard_alignment) {
t->discard_misaligned = 1;
ret = -1;
}
/* If top has no alignment offset, inherit from bottom */
@ -539,23 +565,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->alignment_offset =
b->alignment_offset & (b->physical_block_size - 1);
if (!t->discard_alignment)
t->discard_alignment =
b->discard_alignment & (b->discard_granularity - 1);
/* Top device aligned on logical block boundary? */
if (t->alignment_offset & (t->logical_block_size - 1)) {
t->misaligned = 1;
return -1;
ret = -1;
}
/* Find lcm() of optimal I/O size */
if (t->io_opt && b->io_opt)
t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt);
else if (b->io_opt)
t->io_opt = b->io_opt;
/* Find lcm() of optimal I/O size and granularity */
t->io_opt = lcm(t->io_opt, b->io_opt);
t->discard_granularity = lcm(t->discard_granularity,
b->discard_granularity);
/* Verify that optimal I/O size is a multiple of io_min */
if (t->io_min && t->io_opt % t->io_min)
return -1;
ret = -1;
return 0;
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);

View File

@ -126,6 +126,21 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
return queue_var_show(queue_io_opt(q), page);
}
/* sysfs show handler: print q->limits.discard_granularity into @page. */
static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
{
return queue_var_show(q->limits.discard_granularity, page);
}
/*
 * sysfs show handler for "discard_max_bytes": print the maximum discard
 * size of @q in bytes into @page and return the number of characters
 * written.
 *
 * The limit is kept in 512-byte sectors.  It is widened to 64 bits before
 * being shifted to bytes, so a large sector count cannot wrap around
 * (assumes max_discard_sectors is a 32-bit quantity - TODO confirm against
 * struct queue_limits).
 */
static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)q->limits.max_discard_sectors << 9);
}
/* sysfs show handler: print the result of queue_discard_zeroes_data(q) into @page. */
static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_discard_zeroes_data(q), page);
}
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
@ -293,6 +308,21 @@ static struct queue_sysfs_entry queue_io_opt_entry = {
.show = queue_io_opt_show,
};
/* Read-only queue attributes exposing the discard limits of a request queue. */
static struct queue_sysfs_entry queue_discard_granularity_entry = {
.attr = {.name = "discard_granularity", .mode = S_IRUGO },
.show = queue_discard_granularity_show,
};
/* the show handler converts max_discard_sectors to bytes */
static struct queue_sysfs_entry queue_discard_max_entry = {
.attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
.show = queue_discard_max_show,
};
static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
.show = queue_discard_zeroes_data_show,
};
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_nonrot_show,
@ -328,6 +358,9 @@ static struct attribute *default_attrs[] = {
&queue_physical_block_size_entry.attr,
&queue_io_min_entry.attr,
&queue_io_opt_entry.attr,
&queue_discard_granularity_entry.attr,
&queue_discard_max_entry.attr,
&queue_discard_zeroes_data_entry.attr,
&queue_nonrot_entry.attr,
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,

View File

@ -15,6 +15,7 @@
#include <linux/blkdev.h>
#include <linux/poll.h>
#include <linux/cdev.h>
#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/uio.h>
#include <linux/idr.h>
@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
rq->cmd_len = hdr->request_len;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
rq->timeout = (hdr->timeout * HZ) / 1000;
rq->timeout = msecs_to_jiffies(hdr->timeout);
if (!rq->timeout)
rq->timeout = q->sg_timeout;
if (!rq->timeout)

File diff suppressed because it is too large Load Diff

View File

@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return compat_put_uint(arg, bdev_io_opt(bdev));
case BLKALIGNOFF:
return compat_put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKFLSBUF:
case BLKROSET:
case BLKDISCARD:

View File

@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name)
spin_unlock(&elv_list_lock);
if (!strcmp(name, "anticipatory"))
sprintf(elv, "as-iosched");
else
sprintf(elv, "%s-iosched", name);
sprintf(elv, "%s-iosched", name);
request_module("%s", elv);
spin_lock(&elv_list_lock);
@ -193,10 +190,7 @@ static int __init elevator_setup(char *str)
* Be backwards-compatible with previous kernels, so users
* won't get the wrong elevator.
*/
if (!strcmp(str, "as"))
strcpy(chosen_elevator, "anticipatory");
else
strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
return 1;
}

View File

@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
}
static ssize_t disk_discard_alignment_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue));
}
static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,
NULL);
static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_ro.attr,
&dev_attr_size.attr,
&dev_attr_alignment_offset.attr,
&dev_attr_discard_alignment.attr,
&dev_attr_capability.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,

View File

@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return put_uint(arg, bdev_io_opt(bdev));
case BLKALIGNOFF:
return put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
return put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKSECTGET:
return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
case BLKRASET:

View File

@ -35,7 +35,9 @@
struct blk_cmd_filter {
unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
} blk_default_cmd_filter;
};
static struct blk_cmd_filter blk_default_cmd_filter;
/* Command group 3 is reserved and should never be used. */
const unsigned char scsi_command_size_tbl[8] =
@ -675,7 +677,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
}
EXPORT_SYMBOL(scsi_cmd_ioctl);
int __init blk_scsi_ioctl_init(void)
static int __init blk_scsi_ioctl_init(void)
{
blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
return 0;

View File

@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
instead, which can be configured to be on-disk compatible with the
cryptoloop device.
source "drivers/block/drbd/Kconfig"
config BLK_DEV_NBD
tristate "Network block device support"
depends on NET

View File

@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
swim_mod-objs := swim.o swim_asm.o

View File

@ -179,19 +179,17 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl);
static int deregister_disk(ctlr_info_t *h, int drv_index,
int clear_all, int via_ioctl);
static void cciss_read_capacity(int ctlr, int logvol, int withirq,
static void cciss_read_capacity(int ctlr, int logvol,
sector_t *total_size, unsigned int *block_size);
static void cciss_read_capacity_16(int ctlr, int logvol, int withirq,
static void cciss_read_capacity_16(int ctlr, int logvol,
sector_t *total_size, unsigned int *block_size);
static void cciss_geometry_inquiry(int ctlr, int logvol,
int withirq, sector_t total_size,
sector_t total_size,
unsigned int block_size, InquiryData_struct *inq_buff,
drive_info_struct *drv);
static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
__u32);
static void start_io(ctlr_info_t *h);
static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
__u8 page_code, unsigned char *scsi3addr, int cmd_type);
static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
__u8 page_code, unsigned char scsi3addr[],
int cmd_type);
@ -424,12 +422,9 @@ cciss_proc_write(struct file *file, const char __user *buf,
if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) {
struct seq_file *seq = file->private_data;
ctlr_info_t *h = seq->private;
int rc;
rc = cciss_engage_scsi(h->ctlr);
if (rc != 0)
err = -rc;
else
err = cciss_engage_scsi(h->ctlr);
if (err == 0)
err = length;
} else
#endif /* CONFIG_CISS_SCSI_TAPE */
@ -1657,9 +1652,11 @@ static void cciss_softirq_done(struct request *rq)
{
CommandList_struct *cmd = rq->completion_data;
ctlr_info_t *h = hba[cmd->ctlr];
SGDescriptor_struct *curr_sg = cmd->SG;
unsigned long flags;
u64bit temp64;
int i, ddir;
int sg_index = 0;
if (cmd->Request.Type.Direction == XFER_READ)
ddir = PCI_DMA_FROMDEVICE;
@ -1669,9 +1666,22 @@ static void cciss_softirq_done(struct request *rq)
/* command did not need to be retried */
/* unmap the DMA mapping for all the scatter gather elements */
for (i = 0; i < cmd->Header.SGList; i++) {
temp64.val32.lower = cmd->SG[i].Addr.lower;
temp64.val32.upper = cmd->SG[i].Addr.upper;
pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir);
if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) {
temp64.val32.lower = cmd->SG[i].Addr.lower;
temp64.val32.upper = cmd->SG[i].Addr.upper;
pci_dma_sync_single_for_cpu(h->pdev, temp64.val,
cmd->SG[i].Len, ddir);
pci_unmap_single(h->pdev, temp64.val,
cmd->SG[i].Len, ddir);
/* Point to the next block */
curr_sg = h->cmd_sg_list[cmd->cmdindex]->sgchain;
sg_index = 0;
}
temp64.val32.lower = curr_sg[sg_index].Addr.lower;
temp64.val32.upper = curr_sg[sg_index].Addr.upper;
pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len,
ddir);
++sg_index;
}
#ifdef CCISS_DEBUG
@ -1701,7 +1711,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h,
* via the inquiry page 0. Model, vendor, and rev are set to empty strings if
* they cannot be read.
*/
static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
static void cciss_get_device_descr(int ctlr, int logvol,
char *vendor, char *model, char *rev)
{
int rc;
@ -1717,14 +1727,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
return;
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
if (withirq)
rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
sizeof(InquiryData_struct), 0,
scsi3addr, TYPE_CMD);
else
rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
sizeof(InquiryData_struct), 0,
scsi3addr, TYPE_CMD);
rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0,
scsi3addr, TYPE_CMD);
if (rc == IO_OK) {
memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
vendor[VENDOR_LEN] = '\0';
@ -1743,7 +1747,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
* number cannot be had, for whatever reason, 16 bytes of 0xff
* are returned instead.
*/
static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
static void cciss_get_serial_no(int ctlr, int logvol,
unsigned char *serial_no, int buflen)
{
#define PAGE_83_INQ_BYTES 64
@ -1759,12 +1763,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
return;
memset(serial_no, 0, buflen);
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
if (withirq)
rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
else
rc = sendcmd(CISS_INQUIRY, ctlr, buf,
PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
if (rc == IO_OK)
memcpy(serial_no, &buf[8], buflen);
kfree(buf);
@ -1793,10 +1793,10 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
/* This is a hardware imposed limit. */
blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES);
blk_queue_max_hw_segments(disk->queue, h->maxsgentries);
/* This is a limit in the driver and could be eliminated. */
blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES);
blk_queue_max_phys_segments(disk->queue, h->maxsgentries);
blk_queue_max_sectors(disk->queue, h->cciss_max_sectors);
@ -1852,18 +1852,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
/* testing to see if 16-byte CDBs are already being used */
if (h->cciss_read == CCISS_READ_16) {
cciss_read_capacity_16(h->ctlr, drv_index, 1,
cciss_read_capacity_16(h->ctlr, drv_index,
&total_size, &block_size);
} else {
cciss_read_capacity(ctlr, drv_index, 1,
&total_size, &block_size);
cciss_read_capacity(ctlr, drv_index, &total_size, &block_size);
/* if read_capacity returns all F's this volume is >2TB */
/* in size so we switch to 16-byte CDB's for all */
/* read/write ops */
if (total_size == 0xFFFFFFFFULL) {
cciss_read_capacity_16(ctlr, drv_index, 1,
cciss_read_capacity_16(ctlr, drv_index,
&total_size, &block_size);
h->cciss_read = CCISS_READ_16;
h->cciss_write = CCISS_WRITE_16;
@ -1873,14 +1871,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time,
}
}
cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size,
cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size,
inq_buff, drvinfo);
drvinfo->block_size = block_size;
drvinfo->nr_blocks = total_size + 1;
cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor,
cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor,
drvinfo->model, drvinfo->rev);
cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no,
cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no,
sizeof(drvinfo->serial_no));
/* Save the lunid in case we deregister the disk, below. */
memcpy(drvinfo->LunID, h->drv[drv_index]->LunID,
@ -2531,6 +2529,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
case 0: return IO_OK; /* no sense */
case 1: return IO_OK; /* recovered error */
default:
if (check_for_unit_attention(h, c))
return IO_NEEDS_RETRY;
printk(KERN_WARNING "cciss%d: cmd 0x%02x "
"check condition, sense key = 0x%02x\n",
h->ctlr, c->Request.CDB[0],
@ -2672,7 +2672,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
}
static void cciss_geometry_inquiry(int ctlr, int logvol,
int withirq, sector_t total_size,
sector_t total_size,
unsigned int block_size,
InquiryData_struct *inq_buff,
drive_info_struct *drv)
@ -2683,14 +2683,8 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
memset(inq_buff, 0, sizeof(InquiryData_struct));
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
if (withirq)
return_code = sendcmd_withirq(CISS_INQUIRY, ctlr,
inq_buff, sizeof(*inq_buff),
0xC1, scsi3addr, TYPE_CMD);
else
return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
sizeof(*inq_buff), 0xC1, scsi3addr,
TYPE_CMD);
return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff,
sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD);
if (return_code == IO_OK) {
if (inq_buff->data_byte[8] == 0xFF) {
printk(KERN_WARNING
@ -2723,7 +2717,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
}
static void
cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
cciss_read_capacity(int ctlr, int logvol, sector_t *total_size,
unsigned int *block_size)
{
ReadCapdata_struct *buf;
@ -2737,14 +2731,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
}
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
if (withirq)
return_code = sendcmd_withirq(CCISS_READ_CAPACITY,
ctlr, buf, sizeof(ReadCapdata_struct),
0, scsi3addr, TYPE_CMD);
else
return_code = sendcmd(CCISS_READ_CAPACITY,
ctlr, buf, sizeof(ReadCapdata_struct),
0, scsi3addr, TYPE_CMD);
return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf,
sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD);
if (return_code == IO_OK) {
*total_size = be32_to_cpu(*(__be32 *) buf->total_size);
*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@ -2756,8 +2744,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
kfree(buf);
}
static void
cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, unsigned int *block_size)
static void cciss_read_capacity_16(int ctlr, int logvol,
sector_t *total_size, unsigned int *block_size)
{
ReadCapdata_struct_16 *buf;
int return_code;
@ -2770,16 +2758,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
}
log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
if (withirq) {
return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
ctlr, buf, sizeof(ReadCapdata_struct_16),
0, scsi3addr, TYPE_CMD);
}
else {
return_code = sendcmd(CCISS_READ_CAPACITY_16,
ctlr, buf, sizeof(ReadCapdata_struct_16),
0, scsi3addr, TYPE_CMD);
}
return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
ctlr, buf, sizeof(ReadCapdata_struct_16),
0, scsi3addr, TYPE_CMD);
if (return_code == IO_OK) {
*total_size = be64_to_cpu(*(__be64 *) buf->total_size);
*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@ -2820,13 +2801,13 @@ static int cciss_revalidate(struct gendisk *disk)
return 1;
}
if (h->cciss_read == CCISS_READ_10) {
cciss_read_capacity(h->ctlr, logvol, 1,
cciss_read_capacity(h->ctlr, logvol,
&total_size, &block_size);
} else {
cciss_read_capacity_16(h->ctlr, logvol, 1,
cciss_read_capacity_16(h->ctlr, logvol,
&total_size, &block_size);
}
cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size,
cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size,
inq_buff, drv);
blk_queue_logical_block_size(drv->queue, drv->block_size);
@ -2836,167 +2817,6 @@ static int cciss_revalidate(struct gendisk *disk)
return 0;
}
/*
 * Poll the controller's completion FIFO until a command completes.
 * Used only at init time, when interrupts from the HBA are disabled.
 *
 * Makes up to 20 * HZ attempts, sleeping one tick after every empty poll
 * (about 20 seconds in total).  Returns the value read from the FIFO, or 1
 * (an invalid address) to tell the caller the wait timed out.
 */
static unsigned long pollcomplete(int ctlr)
{
	unsigned long done;
	int tries_left = 20 * HZ;

	while (tries_left-- > 0) {
		done = hba[ctlr]->access.command_completed(hba[ctlr]);
		if (done != FIFO_EMPTY)
			return done;
		schedule_timeout_uninterruptible(1);
	}

	/* Invalid address to tell caller we ran out of time */
	return 1;
}
/*
 * sendcmd_core - send command @c to controller @h and poll for completion.
 * @h: controller to submit to.
 * @c: fully built command; SG[0] must describe the mapped data buffer.
 *
 * Turns interrupts off on the board.  Used at driver init time and
 * during SCSI error recovery.
 *
 * A command that completes with CMD_UNSOLICITED_ABORT is resubmitted up
 * to MAX_CMD_RETRIES times.  Data overrun/underrun on the report and
 * inquiry commands is treated as success.  The DMA mapping recorded in
 * c->SG[0] is always released before returning.
 *
 * Returns IO_OK on success, IO_ERROR on timeout or failure, or the
 * result of check_target_status() for CMD_TARGET_STATUS completions.
 */
static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
{
	int i;
	unsigned long complete;
	int status = IO_ERROR;
	u64bit buff_dma_handle;

resend_cmd1:

	/* Disable interrupt on the board. */
	h->access.set_intr_mask(h, CCISS_INTR_OFF);

	/* Make sure there is room in the command FIFO */
	/* Actually it should be completely empty at this time */
	/* unless we are in here doing error handling for the scsi */
	/* tape side of the driver. */
	for (i = 200000; i > 0; i--) {
		/* if fifo isn't full go */
		if (!(h->access.fifo_full(h)))
			break;
		udelay(10);
		printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
		       " waiting!\n", h->ctlr);
	}
	h->access.submit_command(h, c); /* Send the cmd */
	do {
		complete = pollcomplete(h->ctlr);

#ifdef CCISS_DEBUG
		printk(KERN_DEBUG "cciss: command completed\n");
#endif				/* CCISS_DEBUG */

		/* pollcomplete() returns 1 when it ran out of time */
		if (complete == 1) {
			printk(KERN_WARNING
			       "cciss cciss%d: SendCmd Timeout out, "
			       "No command list address returned!\n", h->ctlr);
			status = IO_ERROR;
			break;
		}

		/* Make sure it's the command we're expecting. */
		if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
			printk(KERN_WARNING "cciss%d: Unexpected command "
			       "completion.\n", h->ctlr);
			continue;
		}

		/* It is our command.  If no error, we're done. */
		if (!(complete & CISS_ERROR_BIT)) {
			status = IO_OK;
			break;
		}

		/* There is an error... */

		/* if data overrun or underrun on Report command ignore it */
		if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
		     (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
		     (c->Request.CDB[0] == CISS_INQUIRY)) &&
		    ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
		     (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
			complete = c->busaddr;
			status = IO_OK;
			break;
		}

		if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
			printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
			       h->ctlr, c);
			if (c->retry_count < MAX_CMD_RETRIES) {
				printk(KERN_WARNING "cciss%d: retrying %p\n",
				       h->ctlr, c);
				c->retry_count++;
				/*
				 * Erase the old error information.  The size
				 * must be that of the pointed-to structure;
				 * sizeof(c->err_info) would only clear
				 * pointer-size bytes of it.
				 */
				memset(c->err_info, 0, sizeof(*c->err_info));
				goto resend_cmd1;
			}
			printk(KERN_WARNING "cciss%d: retried %p too many "
			       "times\n", h->ctlr, c);
			status = IO_ERROR;
			break;
		}

		if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
			printk(KERN_WARNING "cciss%d: command could not be "
			       "aborted.\n", h->ctlr);
			status = IO_ERROR;
			break;
		}

		if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
			status = check_target_status(h, c);
			break;
		}

		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
		       c->Request.CDB[0], c->err_info->CommandStatus);
		status = IO_ERROR;
		break;

	} while (1);

	/* unlock the data buffer from DMA */
	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
	return status;
}
/*
 * sendcmd - build one command and run it to completion by polling.
 *
 * Grabs a command block for controller hba[@ctlr], lets fill_cmd()
 * populate it from the remaining arguments and, if that succeeded,
 * hands it to sendcmd_core().  The command block is released again
 * before returning.  Used at init time, and during SCSI error recovery.
 *
 * Returns IO_ERROR when no command block is available, otherwise the
 * status of fill_cmd() or, when that was IO_OK, of sendcmd_core().
 */
static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
		   __u8 page_code, unsigned char *scsi3addr, int cmd_type)
{
	int rc;
	CommandList_struct *cmdp = cmd_alloc(hba[ctlr], 1);

	if (cmdp == NULL) {
		printk(KERN_WARNING "cciss: unable to get memory");
		return IO_ERROR;
	}

	rc = fill_cmd(cmdp, cmd, ctlr, buff, size, page_code,
		      scsi3addr, cmd_type);
	if (rc == IO_OK)
		rc = sendcmd_core(hba[ctlr], cmdp);

	cmd_free(hba[ctlr], cmdp, 1);
	return rc;
}
/*
* Map (physical) PCI mem into (virtual) kernel space
*/
@ -3255,9 +3075,13 @@ static void do_cciss_request(struct request_queue *q)
int seg;
struct request *creq;
u64bit temp64;
struct scatterlist tmp_sg[MAXSGENTRIES];
struct scatterlist *tmp_sg;
SGDescriptor_struct *curr_sg;
drive_info_struct *drv;
int i, dir;
int nseg = 0;
int sg_index = 0;
int chained = 0;
/* We call start_io here in case there is a command waiting on the
* queue that has not been sent.
@ -3270,13 +3094,14 @@ static void do_cciss_request(struct request_queue *q)
if (!creq)
goto startio;
BUG_ON(creq->nr_phys_segments > MAXSGENTRIES);
BUG_ON(creq->nr_phys_segments > h->maxsgentries);
if ((c = cmd_alloc(h, 1)) == NULL)
goto full;
blk_start_request(creq);
tmp_sg = h->scatter_list[c->cmdindex];
spin_unlock_irq(q->queue_lock);
c->cmd_type = CMD_RWREQ;
@ -3305,7 +3130,7 @@ static void do_cciss_request(struct request_queue *q)
(int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
#endif /* CCISS_DEBUG */
sg_init_table(tmp_sg, MAXSGENTRIES);
sg_init_table(tmp_sg, h->maxsgentries);
seg = blk_rq_map_sg(q, creq, tmp_sg);
/* get the DMA records for the setup */
@ -3314,25 +3139,70 @@ static void do_cciss_request(struct request_queue *q)
else
dir = PCI_DMA_TODEVICE;
curr_sg = c->SG;
sg_index = 0;
chained = 0;
for (i = 0; i < seg; i++) {
c->SG[i].Len = tmp_sg[i].length;
if (((sg_index+1) == (h->max_cmd_sgentries)) &&
!chained && ((seg - i) > 1)) {
nseg = seg - i;
curr_sg[sg_index].Len = (nseg) *
sizeof(SGDescriptor_struct);
curr_sg[sg_index].Ext = CCISS_SG_CHAIN;
/* Point to next chain block. */
curr_sg = h->cmd_sg_list[c->cmdindex]->sgchain;
sg_index = 0;
chained = 1;
}
curr_sg[sg_index].Len = tmp_sg[i].length;
temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
tmp_sg[i].offset,
tmp_sg[i].length, dir);
c->SG[i].Addr.lower = temp64.val32.lower;
c->SG[i].Addr.upper = temp64.val32.upper;
c->SG[i].Ext = 0; // we are not chaining
tmp_sg[i].offset,
tmp_sg[i].length, dir);
curr_sg[sg_index].Addr.lower = temp64.val32.lower;
curr_sg[sg_index].Addr.upper = temp64.val32.upper;
curr_sg[sg_index].Ext = 0; /* we are not chaining */
++sg_index;
}
if (chained) {
int len;
curr_sg = c->SG;
sg_index = h->max_cmd_sgentries - 1;
len = curr_sg[sg_index].Len;
/* Setup pointer to next chain block.
* Fill out last element in current chain
* block with address of next chain block.
*/
temp64.val = pci_map_single(h->pdev,
h->cmd_sg_list[c->cmdindex]->sgchain,
len, dir);
h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val;
curr_sg[sg_index].Addr.lower = temp64.val32.lower;
curr_sg[sg_index].Addr.upper = temp64.val32.upper;
pci_dma_sync_single_for_device(h->pdev,
h->cmd_sg_list[c->cmdindex]->sg_chain_dma,
len, dir);
}
/* track how many SG entries we are using */
if (seg > h->maxSG)
h->maxSG = seg;
#ifdef CCISS_DEBUG
printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n",
blk_rq_sectors(creq), seg);
printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments "
"chained[%d]\n",
blk_rq_sectors(creq), seg, chained);
#endif /* CCISS_DEBUG */
c->Header.SGList = c->Header.SGTotal = seg;
c->Header.SGList = c->Header.SGTotal = seg + chained;
if (seg > h->max_cmd_sgentries)
c->Header.SGList = h->max_cmd_sgentries;
if (likely(blk_fs_request(creq))) {
if(h->cciss_read == CCISS_READ_10) {
c->Request.CDB[1] = 0;
@ -3513,28 +3383,33 @@ static int add_to_scan_list(struct ctlr_info *h)
* @h: Pointer to the controller.
*
* Removes the controller from the rescan queue if present. Blocks if
* the controller is currently conducting a rescan.
* the controller is currently conducting a rescan. The controller
* can be in one of three states:
* 1. Doesn't need a scan
* 2. On the scan list, but not scanning yet (we remove it)
* 3. Busy scanning (and not on the list). In this case we want to wait for
* the scan to complete to make sure the scanning thread for this
* controller is completely idle.
**/
static void remove_from_scan_list(struct ctlr_info *h)
{
struct ctlr_info *test_h, *tmp_h;
int scanning = 0;
mutex_lock(&scan_mutex);
list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) {
if (test_h == h) {
if (test_h == h) { /* state 2. */
list_del(&h->scan_list);
complete_all(&h->scan_wait);
mutex_unlock(&scan_mutex);
return;
}
}
if (&h->busy_scanning)
scanning = 0;
mutex_unlock(&scan_mutex);
if (scanning)
if (h->busy_scanning) { /* state 3. */
mutex_unlock(&scan_mutex);
wait_for_completion(&h->scan_wait);
} else { /* state 1, nothing to do. */
mutex_unlock(&scan_mutex);
}
}
/**
@ -3573,13 +3448,11 @@ static int scan_thread(void *data)
h->busy_scanning = 1;
mutex_unlock(&scan_mutex);
if (h) {
rebuild_lun_table(h, 0, 0);
complete_all(&h->scan_wait);
mutex_lock(&scan_mutex);
h->busy_scanning = 0;
mutex_unlock(&scan_mutex);
}
rebuild_lun_table(h, 0, 0);
complete_all(&h->scan_wait);
mutex_lock(&scan_mutex);
h->busy_scanning = 0;
mutex_unlock(&scan_mutex);
}
}
@ -3605,8 +3478,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
case REPORT_LUNS_CHANGED:
printk(KERN_WARNING "cciss%d: report LUN data "
"changed\n", h->ctlr);
add_to_scan_list(h);
wake_up_process(cciss_scan_thread);
/*
* Here, we could call add_to_scan_list and wake up the scan thread,
* except that it's quite likely that we will get more than one
* REPORT_LUNS_CHANGED condition in quick succession, which means
* that those which occur after the first one will likely happen
* *during* the scan_thread's rescan. And the rescan code is not
* robust enough to restart in the middle, undoing what it has already
* done, and it's not clear that it's even possible to do this, since
* part of what it does is notify the block layer, which starts
* doing its own i/o to read partition tables and so on, and the
* driver doesn't have visibility to know what might need undoing.
* In any event, if possible, it is horribly complicated to get right
* so we just don't do it for now.
*
* Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012.
*/
return 1;
break;
case POWER_OR_RESET:
@ -3888,6 +3775,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
* leave a little room for ioctl calls.
*/
c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
c->maxsgentries = readl(&(c->cfgtable->MaxSGElements));
/*
* Limit native command to 32 s/g elements to save dma'able memory.
* However, the spec says if 0, use 31
*/
c->max_cmd_sgentries = 31;
if (c->maxsgentries > 512) {
c->max_cmd_sgentries = 32;
c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1;
c->maxsgentries -= 1; /* account for chain pointer */
} else {
c->maxsgentries = 31; /* Default to traditional value */
c->chainsize = 0; /* traditional */
}
c->product_name = products[prod_index].product_name;
c->access = *(products[prod_index].access);
c->nr_cmds = c->max_commands - 4;
@ -4214,6 +4118,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
{
int i;
int j = 0;
int k = 0;
int rc;
int dac, return_code;
InquiryData_struct *inq_buff;
@ -4317,6 +4222,53 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
printk(KERN_ERR "cciss: out of memory");
goto clean4;
}
/* Need space for temp scatter list */
hba[i]->scatter_list = kmalloc(hba[i]->max_commands *
sizeof(struct scatterlist *),
GFP_KERNEL);
for (k = 0; k < hba[i]->nr_cmds; k++) {
hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) *
hba[i]->maxsgentries,
GFP_KERNEL);
if (hba[i]->scatter_list[k] == NULL) {
printk(KERN_ERR "cciss%d: could not allocate "
"s/g lists\n", i);
goto clean4;
}
}
hba[i]->cmd_sg_list = kmalloc(sizeof(struct Cmd_sg_list *) *
hba[i]->nr_cmds,
GFP_KERNEL);
if (!hba[i]->cmd_sg_list) {
printk(KERN_ERR "cciss%d: Cannot get memory for "
"s/g chaining.\n", i);
goto clean4;
}
/* Build up chain blocks for each command */
if (hba[i]->chainsize > 0) {
for (j = 0; j < hba[i]->nr_cmds; j++) {
hba[i]->cmd_sg_list[j] =
kmalloc(sizeof(struct Cmd_sg_list),
GFP_KERNEL);
if (!hba[i]->cmd_sg_list[j]) {
printk(KERN_ERR "cciss%d: Cannot get memory "
"for chain block.\n", i);
goto clean4;
}
/* Need a block of chainsized s/g elements. */
hba[i]->cmd_sg_list[j]->sgchain =
kmalloc((hba[i]->chainsize *
sizeof(SGDescriptor_struct)),
GFP_KERNEL);
if (!hba[i]->cmd_sg_list[j]->sgchain) {
printk(KERN_ERR "cciss%d: Cannot get memory "
"for s/g chains\n", i);
goto clean4;
}
}
}
spin_lock_init(&hba[i]->lock);
/* Initialize the pdev driver private data.
@ -4362,7 +4314,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
cciss_procinit(i);
hba[i]->cciss_max_sectors = 2048;
hba[i]->cciss_max_sectors = 8192;
rebuild_lun_table(hba[i], 1, 0);
hba[i]->busy_initializing = 0;
@ -4370,6 +4322,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
clean4:
kfree(hba[i]->cmd_pool_bits);
/* Free up sg elements */
for (k = 0; k < hba[i]->nr_cmds; k++)
kfree(hba[i]->scatter_list[k]);
kfree(hba[i]->scatter_list);
/* Only free up extra s/g lists if controller supports them */
if (hba[i]->chainsize > 0) {
for (j = 0; j < hba[i]->nr_cmds; j++) {
if (hba[i]->cmd_sg_list[j]) {
kfree(hba[i]->cmd_sg_list[j]->sgchain);
kfree(hba[i]->cmd_sg_list[j]);
}
}
kfree(hba[i]->cmd_sg_list);
}
if (hba[i]->cmd_pool)
pci_free_consistent(hba[i]->pdev,
hba[i]->nr_cmds * sizeof(CommandList_struct),
@ -4400,30 +4366,28 @@ clean_no_release_regions:
static void cciss_shutdown(struct pci_dev *pdev)
{
ctlr_info_t *tmp_ptr;
int i;
char flush_buf[4];
ctlr_info_t *h;
char *flush_buf;
int return_code;
tmp_ptr = pci_get_drvdata(pdev);
if (tmp_ptr == NULL)
h = pci_get_drvdata(pdev);
flush_buf = kzalloc(4, GFP_KERNEL);
if (!flush_buf) {
printk(KERN_WARNING
"cciss:%d cache not flushed, out of memory.\n",
h->ctlr);
return;
i = tmp_ptr->ctlr;
if (hba[i] == NULL)
return;
/* Turn board interrupts off and send the flush cache command */
/* sendcmd will turn off interrupt, and send the flush...
* To write all data in the battery backed cache to disks */
memset(flush_buf, 0, 4);
return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
CTLR_LUNID, TYPE_CMD);
if (return_code == IO_OK) {
printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
} else {
printk(KERN_WARNING "Error flushing cache on controller %d\n", i);
}
free_irq(hba[i]->intr[2], hba[i]);
/* write all data in the battery backed cache to disk */
memset(flush_buf, 0, 4);
return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf,
4, 0, CTLR_LUNID, TYPE_CMD);
kfree(flush_buf);
if (return_code != IO_OK)
printk(KERN_WARNING "cciss%d: Error flushing cache\n",
h->ctlr);
h->access.set_intr_mask(h, CCISS_INTR_OFF);
free_irq(h->intr[2], h);
}
static void __devexit cciss_remove_one(struct pci_dev *pdev)
@ -4485,6 +4449,20 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
kfree(hba[i]->cmd_pool_bits);
/* Free up sg elements */
for (j = 0; j < hba[i]->nr_cmds; j++)
kfree(hba[i]->scatter_list[j]);
kfree(hba[i]->scatter_list);
/* Only free up extra s/g lists if controller supports them */
if (hba[i]->chainsize > 0) {
for (j = 0; j < hba[i]->nr_cmds; j++) {
if (hba[i]->cmd_sg_list[j]) {
kfree(hba[i]->cmd_sg_list[j]->sgchain);
kfree(hba[i]->cmd_sg_list[j]);
}
}
kfree(hba[i]->cmd_sg_list);
}
/*
* Deliberately omit pci_disable_device(): it does something nasty to
* Smart Array controllers that pci_enable_device does not undo

View File

@ -55,7 +55,13 @@ typedef struct _drive_info_struct
char device_initialized; /* indicates whether dev is initialized */
} drive_info_struct;
struct ctlr_info
/*
 * Per-command state for scatter/gather chaining: the extra block of s/g
 * descriptors that a command's embedded SG[] list chains to when a
 * request needs more entries than fit in the command itself.
 */
struct Cmd_sg_list {
/* chain block: array of chainsize s/g descriptors, allocated at init */
SGDescriptor_struct *sgchain;
/* DMA address of sgchain as returned by pci_map_single() at submit time */
dma_addr_t sg_chain_dma;
/* NOTE(review): not referenced in the visible code -- presumably the mapped length of the chain block; confirm */
int chain_block_size;
};
struct ctlr_info
{
int ctlr;
char devname[8];
@ -75,6 +81,16 @@ struct ctlr_info
int num_luns;
int highest_lun;
int usage_count; /* number of opens all all minor devices */
/* Need space for temp sg list
* number of scatter/gathers supported
* number of scatter/gathers in chained block
*/
struct scatterlist **scatter_list;
int maxsgentries;
int chainsize;
int max_cmd_sgentries;
struct Cmd_sg_list **cmd_sg_list;
# define DOORBELL_INT 0
# define PERF_MODE_INT 1
# define SIMPLE_MODE_INT 2

View File

@ -7,7 +7,8 @@
//general boundary definitions
#define SENSEINFOBYTES 32//note that this value may vary between host implementations
#define MAXSGENTRIES 31
#define MAXSGENTRIES 32
#define CCISS_SG_CHAIN 0x80000000
#define MAXREPLYQS 256
//Command Status value
@ -319,6 +320,10 @@ typedef struct _CfgTable_struct {
BYTE ServerName[16];
DWORD HeartBeat;
DWORD SCSI_Prefetch;
DWORD MaxSGElements;
DWORD MaxLogicalUnits;
DWORD MaxPhysicalDrives;
DWORD MaxPhysicalDrivesPerLogicalUnit;
} CfgTable_struct;
#pragma pack()
#endif // CCISS_CMD_H

View File

@ -755,7 +755,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag)
cp,
ei->ScsiStatus);
#endif
cmd->result |= (ei->ScsiStatus < 1);
cmd->result |= (ei->ScsiStatus << 1);
}
else { /* scsi status is zero??? How??? */
@ -1547,7 +1547,7 @@ cciss_engage_scsi(int ctlr)
if (sa->registered) {
printk("cciss%d: SCSI subsystem already engaged.\n", ctlr);
spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
return ENXIO;
return -ENXIO;
}
sa->registered = 1;
spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);

View File

@ -0,0 +1,71 @@
#
# DRBD device driver configuration
#
comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
depends on !PROC_FS || !INET || !CONNECTOR
config BLK_DEV_DRBD
tristate "DRBD Distributed Replicated Block Device support"
depends on PROC_FS && INET && CONNECTOR
select LRU_CACHE
default n
help
NOTE: In order to authenticate connections you have to select
CRYPTO_HMAC and a hash function as well.
DRBD is a shared-nothing, synchronously replicated block device. It
is designed to serve as a building block for high availability
clusters and in this context, is a "drop-in" replacement for shared
storage. Simplistically, you could see it as a network RAID 1.
Each minor device has a role, which can be 'primary' or 'secondary'.
On the node with the primary device the application is supposed to
run and to access the device (/dev/drbdX). Every write is sent to
the local 'lower level block device' and, across the network, to the
node with the device in 'secondary' state. The secondary device
simply writes the data to its lower level block device.
DRBD can also be used in dual-Primary mode (device writable on both
nodes), which means it can exhibit shared disk semantics in a
shared-nothing cluster. Needless to say, on top of dual-Primary
DRBD a cluster file system is necessary to maintain cache
coherency.
For automatic failover you need a cluster manager (e.g. heartbeat).
See also: http://www.drbd.org/, http://www.linux-ha.org
If unsure, say N.
config DRBD_FAULT_INJECTION
bool "DRBD fault injection"
depends on BLK_DEV_DRBD
help
Say Y here if you want to simulate IO errors, in order to test DRBD's
behavior.
The actual simulation of IO errors is done by writing 3 values to
/sys/module/drbd/parameters/
enable_faults: bitmask of...
1 meta data write
2 meta data read
4 resync data write
8 resync data read
16 data write
32 data read
64 read ahead
128 kmalloc of bitmap
256 allocation of EE (epoch_entries)
fault_devs: bitmask of minor numbers
fault_rate: frequency in percent
Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
echo 16 > /sys/module/drbd/parameters/enable_faults
echo 1 > /sys/module/drbd/parameters/fault_devs
echo 5 > /sys/module/drbd/parameters/fault_rate
If unsure, say N.

View File

@ -0,0 +1,5 @@
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

2364
drivers/block/drbd/drbd_nl.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,265 @@
/*
drbd_proc.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/drbd.h>
#include "drbd_int.h"
static int drbd_proc_open(struct inode *inode, struct file *file);
struct proc_dir_entry *drbd_proc;
/*
 * File operations for the DRBD proc entry.  Opening goes through
 * drbd_proc_open(), which sets the file up as a single-shot seq_file
 * rendered by drbd_seq_show(); reading, seeking and release are left
 * to the generic seq_file helpers.
 */
struct file_operations drbd_proc_fops = {
.owner = THIS_MODULE,
.open = drbd_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/*
 * drbd_syncer_progress - print resync progress, ETA and throughput.
 * @mdev: device whose resync counters are reported.
 * @seq:  seq_file to print into.
 *
 * Progress bar adapted from drivers/md/md.c; output looks like
 *	[=====>..............] 33.5% (23456/123456)
 *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
 *
 * If rs_mark_time has not moved for more than 20 seconds only
 * "stalled" is printed in place of the finish/speed line.
 */
static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
{
	unsigned long remaining, elapsed, delta, rate, eta;
	unsigned int tenths;	/* progress in tenths of a percent */
	int filled, blank, n;

	drbd_get_syncer_progress(mdev, &remaining, &tenths);

	/* the bar has 20 cells, each worth 50 tenths of a percent */
	filled = tenths/50;
	blank = 20-filled;

	seq_printf(seq, "\t[");
	/* one cell of the filled part is taken by the '>' head */
	for (n = filled - 1; n > 0; n--)
		seq_printf(seq, "=");
	seq_printf(seq, ">");
	for (n = blank; n > 0; n--)
		seq_printf(seq, ".");
	seq_printf(seq, "] ");

	seq_printf(seq, "sync'ed:%3u.%u%% ", tenths / 10, tenths % 10);

	/* small totals are shown in K, anything above the threshold in M */
	if (mdev->rs_total <= 0x100000L)
		seq_printf(seq, "(%lu/%lu)K\n\t",
			   (unsigned long) Bit2KB(remaining),
			   (unsigned long) Bit2KB(mdev->rs_total));
	else
		seq_printf(seq, "(%lu/%lu)M\n\t",
			   (unsigned long) Bit2KB(remaining >> 10),
			   (unsigned long) Bit2KB(mdev->rs_total >> 10));

	/* see drivers/md/md.c
	 * We do not want to overflow, so the order of operands and
	 * the * 100 / 100 trick are important. We do a +1 to be
	 * safe against division by zero. We only estimate anyway.
	 *
	 * elapsed: time from mark until now
	 * delta:   blocks written from mark until now
	 * eta:     remaining time
	 */
	elapsed = (jiffies - mdev->rs_mark_time) / HZ;
	if (elapsed > 20) {
		/* no update to rs_mark_time for too long: we are stalled */
		seq_printf(seq, "stalled\n");
		return;
	}
	if (elapsed == 0)
		elapsed = 1;

	delta = mdev->rs_mark_left - remaining;
	eta = (elapsed * (remaining / (delta/100+1)))/100; /* seconds */

	seq_printf(seq, "finish: %lu:%02lu:%02lu",
		   eta / 3600, (eta % 3600) / 60, eta % 60);

	/* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
	rate = Bit2KB(delta/elapsed);
	if (rate <= 1000)
		seq_printf(seq, " speed: %ld", rate);
	else
		seq_printf(seq, " speed: %ld,%03ld",
			   rate/1000, rate % 1000);

	/* mean speed since the syncer started, PausedSync periods excluded */
	elapsed = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (elapsed == 0)
		elapsed = 1;
	delta = mdev->rs_total - remaining;
	rate = Bit2KB(delta/elapsed);
	if (rate <= 1000)
		seq_printf(seq, " (%ld)", rate);
	else
		seq_printf(seq, " (%ld,%03ld)",
			   rate/1000, rate % 1000);

	seq_printf(seq, " K/sec\n");
}
/*
 * Print one resync extent: its rs_left counter followed by a tag for
 * each of the BME_NO_WRITES and BME_LOCKED flags (dashes when clear).
 * Passed to lc_seq_dump_details() as the per-element callback.
 */
static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
{
	struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
	const char *writes_tag = "---------";
	const char *locked_tag = "------";

	if (ext->flags & BME_NO_WRITES)
		writes_tag = "NO_WRITES";
	if (ext->flags & BME_LOCKED)
		locked_tag = "LOCKED";

	seq_printf(seq, "%5d %s %s\n", ext->rs_left, writes_tag, locked_tag);
}
/*
 * drbd_seq_show - render the whole DRBD proc file.
 *
 * Prints a version banner, then one status entry per configured minor
 * (0 .. minor_count-1).  Minors without a device are skipped; a blank
 * line is printed before the next configured minor after such a gap.
 * Always returns 0.
 */
static int drbd_seq_show(struct seq_file *seq, void *v)
{
int i, hole = 0;
const char *sn;
struct drbd_conf *mdev;
/* one-letter tag per write ordering mode, printed as "wo:" */
static char write_ordering_chars[] = {
[WO_none] = 'n',
[WO_drain_io] = 'd',
[WO_bdev_flush] = 'f',
[WO_bio_barrier] = 'b',
};
seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
/*
cs .. connection state
ro .. node role (local/remote)
ds .. disk state (local/remote)
protocol
various flags
ns .. network send
nr .. network receive
dw .. disk write
dr .. disk read
al .. activity log write count
bm .. bitmap update write count
lo .. local_cnt (printed from mdev->local_cnt)
pe .. pending (waiting for ack or data reply)
ua .. unack'd (still need to send ack or data reply)
ap .. application requests accepted, but not yet completed
ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
wo .. write ordering mode currently in use
oos .. known out-of-sync kB
*/
for (i = 0; i < minor_count; i++) {
mdev = minor_to_mdev(i);
if (!mdev) {
/* remember the gap so a separator line precedes the next entry */
hole = 1;
continue;
}
if (hole) {
hole = 0;
seq_printf(seq, "\n");
}
sn = drbd_conn_str(mdev->state.conn);
/* standalone + diskless + secondary is reported as "Unconfigured" */
if (mdev->state.conn == C_STANDALONE &&
mdev->state.disk == D_DISKLESS &&
mdev->state.role == R_SECONDARY) {
seq_printf(seq, "%2d: cs:Unconfigured\n", i);
} else {
/*
 * Argument order below must match the format string exactly:
 * six %c flags, six %u counters, five %d counters, one %c.
 * NOTE(review): the four transfer counters are halved before
 * printing -- presumably sectors to KB; confirm.
 */
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
"lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
i, sn,
drbd_role_str(mdev->state.role),
drbd_role_str(mdev->state.peer),
drbd_disk_str(mdev->state.disk),
drbd_disk_str(mdev->state.pdsk),
(mdev->net_conf == NULL ? ' ' :
(mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
mdev->state.susp ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
mdev->congestion_reason ?: '-',
mdev->send_cnt/2,
mdev->recv_cnt/2,
mdev->writ_cnt/2,
mdev->read_cnt/2,
mdev->al_writ_cnt,
mdev->bm_writ_cnt,
atomic_read(&mdev->local_cnt),
atomic_read(&mdev->ap_pending_cnt) +
atomic_read(&mdev->rs_pending_cnt),
atomic_read(&mdev->unacked_cnt),
atomic_read(&mdev->ap_bio_cnt),
mdev->epochs,
write_ordering_chars[mdev->write_ordering]
);
seq_printf(seq, " oos:%lu\n",
Bit2KB(drbd_bm_total_weight(mdev)));
}
/* resync running in either direction: add the progress bar */
if (mdev->state.conn == C_SYNC_SOURCE ||
mdev->state.conn == C_SYNC_TARGET)
drbd_syncer_progress(mdev, seq);
/* online verify: percent done; the +1 guards against division by zero */
if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
seq_printf(seq, "\t%3d%% %lu/%lu\n",
(int)((mdev->rs_total-mdev->ov_left) /
(mdev->rs_total/100+1)),
mdev->rs_total - mdev->ov_left,
mdev->rs_total);
/* cache statistics are printed only while a local-disk reference is held */
if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
lc_seq_printf_stats(seq, mdev->resync);
lc_seq_printf_stats(seq, mdev->act_log);
put_ldev(mdev);
}
if (proc_details >= 2) {
if (mdev->resync) {
lc_seq_dump_details(seq, mdev->resync, "rs_left",
resync_dump_detail);
}
}
}
return 0;
}
/*
 * open() handler for the DRBD proc entry: sets the file up as a
 * single-shot seq_file rendered by drbd_seq_show(), passing along the
 * private data stored in the proc entry.
 */
static int drbd_proc_open(struct inode *inode, struct file *file)
{
	void *entry_data = PDE(inode)->data;

	return single_open(file, drbd_seq_show, entry_data);
}
/* PROC FS stuff end */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,326 @@
/*
drbd_req.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
DRBD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
DRBD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DRBD_REQ_H
#define _DRBD_REQ_H
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/drbd.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"
/* The request callbacks will be called in irq context by the IDE drivers,
and in Softirqs/Tasklets/BH context by the SCSI drivers,
and by the receiver and worker in kernel-thread context.
Try to get the locking right :) */
/*
* Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
* associated with IO requests originating from the block layer above us.
*
* There are quite a few things that may happen to a drbd request
* during its lifetime.
*
* It will be created.
* It will be marked with the intention to be
* submitted to local disk and/or
* sent via the network.
*
* It has to be placed on the transfer log and other housekeeping lists,
* In case we have a network connection.
*
* It may be identified as a concurrent (write) request
* and be handled accordingly.
*
* It may be handed over to the local disk subsystem.
* It may be completed by the local disk subsystem,
* either successfully or with io-error.
* In case it is a READ request, and it failed locally,
* it may be retried remotely.
*
* It may be queued for sending.
* It may be handed over to the network stack,
* which may fail.
* It may be acknowledged by the "peer" according to the wire_protocol in use.
* this may be a negative ack.
* It may receive a faked ack when the network connection is lost and the
* transfer log is cleaned up.
* Sending may be canceled due to network connection loss.
* When it finally has outlived its time,
* corresponding dirty bits in the resync-bitmap may be cleared or set,
* it will be destroyed,
* and completion will be signalled to the originator,
* with or without "success".
*/
/*
 * Things that can happen to a struct drbd_request during its lifetime.
 * Enumerator order fixes the numeric values, so append new entries
 * rather than inserting them if anything depends on the numbers.
 */
enum drbd_req_event {
created,
to_be_send,
to_be_submitted,
/* XXX yes, now I am inconsistent...
* these two are not "events" but "actions"
* oh, well... */
queue_for_net_write,
queue_for_net_read,
send_canceled,
send_failed,
handed_over_to_network,
connection_lost_while_pending,
recv_acked_by_peer,
write_acked_by_peer,
write_acked_by_peer_and_sis, /* and set_in_sync */
conflict_discarded_by_peer,
neg_acked,
barrier_acked, /* in protocol A and B */
data_received, /* (remote read) */
read_completed_with_error,
read_ahead_completed_with_error,
write_completed_with_error,
completed_ok,
nothing, /* for tracing only */
};
/* encoding of request states for now. we don't actually need that many bits.
* we don't need to do atomic bit operations either, since most of the time we
* need to look at the connection state and/or manipulate some lists at the
* same time, so we should hold the request lock anyways.
*/
/*
 * Bit positions (not masks) of the request state flags.  The RQ_*
 * masks are derived from these as (1UL << __RQ_*); bits 0-2 describe
 * the local disk part of a request, the following bits the network
 * part.  Enumerator order defines the bit numbers.
 */
enum drbd_req_state_bits {
/* 210
* 000: no local possible
* 001: to be submitted
* UNUSED, we could map: 011: submitted, completion still pending
* 110: completed ok
* 010: completed with error
*/
__RQ_LOCAL_PENDING,
__RQ_LOCAL_COMPLETED,
__RQ_LOCAL_OK,
/* 76543
* 00000: no network possible
* 00001: to be sent
* 00011: to be sent, on worker queue
* 00101: sent, expecting recv_ack (B) or write_ack (C)
* 11101: sent,
* recv_ack (B) or implicit "ack" (A),
* still waiting for the barrier ack.
* master_bio may already be completed and invalidated.
* 11100: write_acked (C),
* data_received (for remote read, any protocol)
* or finally the barrier ack has arrived (B,A)...
* request can be freed
* 01100: neg-acked (write, protocol C)
* or neg-d-acked (read, any protocol)
* or killed from the transfer log
* during cleanup after connection loss
* request can be freed
* 01000: canceled or send failed...
* request can be freed
*/
/* if "SENT" is not set, yet, this can still fail or be canceled.
* if "SENT" is set already, we still wait for an Ack packet.
* when cleared, the master_bio may be completed.
* in (B,A) the request object may still linger on the transaction log
* until the corresponding barrier ack comes in */
__RQ_NET_PENDING,
/* If it is QUEUED, and it is a WRITE, it is also registered in the
* transfer log. Currently we need this flag to avoid conflicts between
* worker canceling the request and tl_clear_barrier killing it from
* transfer log. We should restructure the code so this conflict does
* no longer occur. */
__RQ_NET_QUEUED,
/* well, actually only "handed over to the network stack".
*
* TODO can potentially be dropped because of the similar meaning
* of RQ_NET_SENT and ~RQ_NET_QUEUED.
* however it is not exactly the same. before we drop it
* we must ensure that we can tell a request with network part
* from a request without, regardless of what happens to it. */
__RQ_NET_SENT,
/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
* basically this means the corresponding P_BARRIER_ACK was received */
__RQ_NET_DONE,
/* whether or not we know (C) or pretend (B,A) that the write
* was successfully written on the peer.
*/
__RQ_NET_OK,
/* peer called drbd_set_in_sync() for this write */
__RQ_NET_SIS,
/* keep this last, it's for the RQ_NET_MASK */
__RQ_NET_MAX,
};
/* Single-bit masks for the local-disk state bits (bit numbers 0..2). */
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
/* all bits up to and including RQ_LOCAL_OK */
#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
/* Single-bit masks for the network state bits. */
#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
#define RQ_NET_OK (1UL << __RQ_NET_OK)
#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
/* every bit below __RQ_NET_MAX that is not a local bit */
/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
/* Epoch entries: pick the ee_hash bucket responsible for @sector.
 * The hash table must have been sized (ee_hash_s != 0) before use. */
static inline
struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int bucket;

	BUG_ON(mdev->ee_hash_s == 0);
	bucket = (unsigned int)(sector >> HT_SHIFT) % mdev->ee_hash_s;
	return &mdev->ee_hash[bucket];
}
/* Transfer log (drbd_request objects): pick the tl_hash bucket
 * responsible for @sector.  The table must have been sized
 * (tl_hash_s != 0) before use. */
static inline
struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int bucket;

	BUG_ON(mdev->tl_hash_s == 0);
	bucket = (unsigned int)(sector >> HT_SHIFT) % mdev->tl_hash_s;
	return &mdev->tl_hash[bucket];
}
/* Application reads (drbd_request objects): pick the app_reads_hash
 * bucket responsible for @sector.  The table has the fixed size
 * APP_R_HSIZE.
 *
 * Marked inline like the other hash-slot helpers in this header, so
 * that files including it without using it do not carry an unused
 * static function. */
static inline
struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
	return mdev->app_reads_hash
		+ ((unsigned int)(sector) % APP_R_HSIZE);
}
/* When the answer to a read request arrives, make sure it refers to a
 * request that is actually registered in the application-reads hash.
 * Returns the request, or NULL if no entry in the bucket for @sector
 * matches @id. */
static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
	u64 id, sector_t sector)
{
	struct hlist_head *bucket = ar_hash_slot(mdev, sector);
	struct drbd_request *req;
	struct hlist_node *pos;

	hlist_for_each_entry(req, pos, bucket, colision) {
		/* the id is compared against the request's own address */
		if ((unsigned long)req != (unsigned long)id)
			continue;
		D_ASSERT(req->sector == sector);
		return req;
	}
	return NULL;
}
/* Allocate and initialise a drbd_request for the incoming @bio_src.
 *
 * A private clone of @bio_src is created for local submission; its
 * completion handler is drbd_endio_pri() and its bi_private points
 * back at the new request.
 *
 * Returns the new request, or NULL if either the request or the bio
 * clone could not be allocated.  On failure nothing is leaked. */
static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
	struct bio *bio_src)
{
	struct drbd_request *req;
	struct bio *bio;

	req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
	if (unlikely(!req))
		return NULL;

	/* bio_clone() may return NULL; do not dereference it blindly */
	bio = bio_clone(bio_src, GFP_NOIO);
	if (unlikely(!bio)) {
		mempool_free(req, drbd_request_mempool);
		return NULL;
	}

	req->rq_state = 0;
	req->mdev = mdev;
	req->master_bio = bio_src;
	req->private_bio = bio;
	req->epoch = 0;
	req->sector = bio->bi_sector;
	req->size = bio->bi_size;
	req->start_time = jiffies;
	INIT_HLIST_NODE(&req->colision);
	INIT_LIST_HEAD(&req->tl_requests);
	INIT_LIST_HEAD(&req->w.list);

	bio->bi_private = req;
	bio->bi_end_io = drbd_endio_pri;
	bio->bi_next = NULL;

	return req;
}
/* Give @req back to drbd_request_mempool, the pool drbd_req_new()
 * allocates from. */
static inline void drbd_req_free(struct drbd_request *req)
{
mempool_free(req, drbd_request_mempool);
}
/* Do the two extents intersect?
 * @s1, @s2 are start sectors; @l1, @l2 are lengths that get converted
 * to sectors by ">> 9".  Extents that merely touch do not overlap. */
static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
	return (s1 + (l1>>9) > s2) && (s1 < s2 + (l2>>9));
}
/* Short-lived temporary struct on the stack.
 * It pairs a bio with the error to be reported for it; see _req_mod()
 * and req_mod(), which hand it to complete_master_bio() when ->bio is
 * set.
 * We could squirrel the error to be returned into
 * bio->bi_size, or similar. But that would be too ugly. */
struct bio_and_error {
struct bio *bio;
int error;
};
/* Request state machine entry points, defined outside this header.
 * __req_mod() possibly frees req; its callers below check m->bio
 * afterwards and, if set, pass m on to complete_master_bio(). */
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m);
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
/* __req_mod possibly frees req, do not touch req after that! */
__req_mod(req, what, &m);
if (m.bio)
complete_master_bio(mdev, &m);
}
/* completion of master bio is outside of spinlock.
* If you need it irqsave, do it your self! */
static inline void req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
spin_lock_irq(&mdev->req_lock);
__req_mod(req, what, &m);
spin_unlock_irq(&mdev->req_lock);
if (m.bio)
complete_master_bio(mdev, &m);
}
#endif

View File

@ -0,0 +1,113 @@
/*
drbd.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/drbd.h>
/* Printable names of the connection states, indexed by enum drbd_conns
 * value; looked up by drbd_conn_str(). */
static const char *drbd_conn_s_names[] = {
[C_STANDALONE] = "StandAlone",
[C_DISCONNECTING] = "Disconnecting",
[C_UNCONNECTED] = "Unconnected",
[C_TIMEOUT] = "Timeout",
[C_BROKEN_PIPE] = "BrokenPipe",
[C_NETWORK_FAILURE] = "NetworkFailure",
[C_PROTOCOL_ERROR] = "ProtocolError",
[C_WF_CONNECTION] = "WFConnection",
[C_WF_REPORT_PARAMS] = "WFReportParams",
[C_TEAR_DOWN] = "TearDown",
[C_CONNECTED] = "Connected",
[C_STARTING_SYNC_S] = "StartingSyncS",
[C_STARTING_SYNC_T] = "StartingSyncT",
[C_WF_BITMAP_S] = "WFBitMapS",
[C_WF_BITMAP_T] = "WFBitMapT",
[C_WF_SYNC_UUID] = "WFSyncUUID",
[C_SYNC_SOURCE] = "SyncSource",
[C_SYNC_TARGET] = "SyncTarget",
[C_PAUSED_SYNC_S] = "PausedSyncS",
[C_PAUSED_SYNC_T] = "PausedSyncT",
[C_VERIFY_S] = "VerifyS",
[C_VERIFY_T] = "VerifyT",
};
/* Printable names of the node roles, indexed by enum drbd_role value;
 * looked up by drbd_role_str(). */
static const char *drbd_role_s_names[] = {
[R_PRIMARY] = "Primary",
[R_SECONDARY] = "Secondary",
[R_UNKNOWN] = "Unknown"
};
/* Printable names of the disk states, indexed by enum drbd_disk_state
 * value; looked up by drbd_disk_str(). */
static const char *drbd_disk_s_names[] = {
[D_DISKLESS] = "Diskless",
[D_ATTACHING] = "Attaching",
[D_FAILED] = "Failed",
[D_NEGOTIATING] = "Negotiating",
[D_INCONSISTENT] = "Inconsistent",
[D_OUTDATED] = "Outdated",
[D_UNKNOWN] = "DUnknown",
[D_CONSISTENT] = "Consistent",
[D_UP_TO_DATE] = "UpToDate",
};
/* Human readable explanations for failed state changes.
 * The table is indexed by the NEGATED SS_* return code (note the
 * "-SS_..." designators); looked up by drbd_set_st_err_str(). */
static const char *drbd_state_sw_errors[] = {
[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
[-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
[-SS_DEVICE_IN_USE] = "Device is held open by someone",
[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
};
/* Return the printable name of connection state @s, or "TOO_LARGE" if
 * @s lies beyond the name table.
 *
 * The bound is derived from the table itself rather than from a fixed
 * enumerator, so that every state the table names (including
 * C_VERIFY_S and C_VERIFY_T) is actually reported by name. */
const char *drbd_conn_str(enum drbd_conns s)
{
	const unsigned int nr_names =
		sizeof(drbd_conn_s_names) / sizeof(drbd_conn_s_names[0]);

	/* enums are unsigned...; the cast makes that explicit */
	return (unsigned int)s >= nr_names ? "TOO_LARGE" : drbd_conn_s_names[s];
}
/* Return the printable name of role @s, or "TOO_LARGE" for values
 * above R_SECONDARY. */
const char *drbd_role_str(enum drbd_role s)
{
	if (s > R_SECONDARY)
		return "TOO_LARGE";

	return drbd_role_s_names[s];
}
/* Return the printable name of disk state @s, or "TOO_LARGE" for
 * values above D_UP_TO_DATE. */
const char *drbd_disk_str(enum drbd_disk_state s)
{
	if (s > D_UP_TO_DATE)
		return "TOO_LARGE";

	return drbd_disk_s_names[s];
}
/* Return the explanation for the failed state change @err.
 * Codes at or below SS_AFTER_LAST_ERROR yield "TOO_SMALL", codes above
 * SS_TWO_PRIMARIES yield "TOO_LARGE"; everything in between is looked
 * up by its negated value. */
const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
{
	if (err <= SS_AFTER_LAST_ERROR)
		return "TOO_SMALL";
	if (err > SS_TWO_PRIMARIES)
		return "TOO_LARGE";

	return drbd_state_sw_errors[-err];
}

View File

@ -0,0 +1,351 @@
/*
-*- linux-c -*-
drbd_receiver.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DRBD_VLI_H
#define _DRBD_VLI_H
/*
* At a granularity of 4KiB storage represented per bit,
* and storage sizes of several TiB,
* and possibly small-bandwidth replication,
* the bitmap transfer time can take much too long,
* if transmitted in plain text.
*
* We try to reduce the transferred bitmap information
* by encoding runlengths of bit polarity.
*
* We never actually need to encode a "zero" (runlengths are positive).
* But then we have to store the value of the first bit.
* The first bit of information thus shall encode if the first runlength
* gives the number of set or unset bits.
*
* We assume that large areas are either completely set or unset,
* which gives good compression with any runlength method,
* even when encoding the runlength as fixed size 32bit/64bit integers.
*
* Still, there may be areas where the polarity flips every few bits,
* and encoding the runlength sequence of those areas with fix size
* integers would be much worse than plaintext.
*
* We want to encode small runlength values with minimum code length,
* while still being able to encode a Huge run of all zeros.
*
* Thus we need a Variable Length Integer encoding, VLI.
*
* For some cases, we produce more code bits than plaintext input.
* We need to send incompressible chunks as plaintext, skip over them
* and then see if the next chunk compresses better.
*
* We don't care too much about "excellent" compression ratio for large
* runlengths (all set/all clear): whether we achieve a factor of 100
* or 1000 is not that much of an issue.
* We do not want to waste too much on short runlengths in the "noisy"
* parts of the bitmap, though.
*
* There are endless variants of VLI, we experimented with:
* * simple byte-based
* * various bit based with different code word length.
*
* To avoid yet another configuration parameter (choice of bitmap compression
* algorithm) which was difficult to explain and tune, we just chose the one
* variant that turned out best in all test cases.
* Based on real world usage patterns, with device sizes ranging from a few GiB
* to several TiB, file server/mailserver/webserver/mysql/postgres,
* mostly idle to really busy, the all time winner (though sometimes only
* marginally better) is:
*/
/*
* encoding is "visualised" as
* __little endian__ bitstream, least significant bit first (left most)
*
* this particular encoding is chosen so that the prefix code
* starts as unary encoding the level, then modified so that
* 10 levels can be described in 8bit, with minimal overhead
* for the smaller levels.
*
* The number of data bits follows the fibonacci sequence, with the exception
* of the last level (+1 data bit, so it makes 64bit total). The only case
* where the code is longer than the plain text is a bit polarity runlength
* of 1: 1 plain bit => 2 code bits.
prefix data bits max val data bits
0 x 0x2 1
10 x 0x4 1
110 xx 0x8 2
1110 xxx 0x10 3
11110 xxx xx 0x30 5
111110 xx xxxxxx 0x130 8
11111100 xxxxxxxx xxxxx 0x2130 13
11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
* maximum encodable value: 0x100000400202130 == 2**56 + some */
/* compression "table":
transmitted x 0.29
as plaintext x ........................
x ........................
x ........................
x 0.59 0.21........................
x ........................................................
x .. c ...................................................
x 0.44.. o ...................................................
x .......... d ...................................................
x .......... e ...................................................
X............. ...................................................
x.............. b ...................................................
2.0x............... i ...................................................
#X................ t ...................................................
#................. s ........................... plain bits ..........
-+-----------------------------------------------------------------------
1 16 32 64
*/
/* LEVEL: (total bits, prefix bits, prefix value),
 * sorted ascending by number of total bits.
 * The rest of the code table is calculated at compiletime from this.
 *
 * VLI_L_1_1() expands to one LEVEL() invocation per code level; the
 * user has to define LEVEL(t,b,v) before expanding it (see
 * vli_decode_bits() and __vli_encode_bits()). */
/* fibonacci data 1, 1, ... */
#define VLI_L_1_1() do { \
LEVEL( 2, 1, 0x00); \
LEVEL( 3, 2, 0x01); \
LEVEL( 5, 3, 0x03); \
LEVEL( 7, 4, 0x07); \
LEVEL(10, 5, 0x0f); \
LEVEL(14, 6, 0x1f); \
LEVEL(21, 8, 0x3f); \
LEVEL(29, 8, 0x7f); \
LEVEL(42, 8, 0xbf); \
LEVEL(64, 8, 0xff); \
} while (0)
/* Finds a suitable level to decode the least significant part of @in
 * and stores the decoded value in *@out.
 * Returns the number of bits consumed (the level's total bit count).
 *
 * BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{
/* smallest value representable by the level currently being tried */
u64 adj = 1;
/* One decode step per level: if the low b bits of in equal the prefix v,
 * take the low t bits, drop the prefix, add adj and return t.
 * Otherwise skip the 2^(t-b) values this level covers and try the next. */
#define LEVEL(t,b,v) \
do { \
if ((in & ((1 << b) -1)) == v) { \
*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
return t; \
} \
adj += 1ULL << (t - b); \
} while (0)
VLI_L_1_1();
/* NOT REACHED, if VLI_LEVELS code table is defined properly */
BUG();
#undef LEVEL
}
/* Computes the VLI code word for @in.
 * If @out is not NULL the code word is stored there; with a NULL @out
 * only the code length is determined.
 *
 * Returns the number of code bits needed,
 * or a negative error number: -EINVAL for in == 0, -EOVERFLOW if @in is
 * larger than the last level can represent. */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{
/* largest value representable by the levels tried so far */
u64 max = 0;
/* smallest value representable by the level currently being tried */
u64 adj = 1;
if (in == 0)
return -EINVAL;
/* One encode step per level: a level with t total and b prefix bits
 * covers 2^(t-b) values; if in falls into it, the code word is the
 * offset within the level shifted above the prefix v. */
#define LEVEL(t,b,v) do { \
max += 1ULL << (t - b); \
if (in <= max) { \
if (out) \
*out = ((in - adj) << b) | v; \
return t; \
} \
adj = max + 1; \
} while (0)
VLI_L_1_1();
return -EOVERFLOW;
#undef LEVEL
}
#undef VLI_L_1_1
/* code from here down is independent of the actually used bit code */
/*
* Code length is determined by some unique (e.g. unary) prefix.
* This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
* not a byte stream.
*/
/* for the bitstream, we need a cursor: a byte pointer plus a bit
 * offset within that byte */
struct bitstream_cursor {
/* the current byte */
u8 *b;
/* the current bit within *b, normalized: 0..7 */
unsigned int bit;
};
/* initialize cursor to point to first bit of stream:
 * byte pointer = @s, bit offset = 0 */
static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
{
cur->b = s;
cur->bit = 0;
}
/* Move the cursor forward by @bits bits, keeping the bit offset
 * normalized to 0..7.  Maximum expected input value: 64,
 * but depending on VLI implementation, it may be more. */
static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
{
	/* bit position relative to the start of the current byte */
	const unsigned int pos = cur->bit + bits;

	cur->b += pos >> 3;
	cur->bit = pos & 7;
}
/* the bitstream itself knows its length */
struct bitstream {
/* current read/write position */
struct bitstream_cursor cur;
/* start of the underlying buffer */
unsigned char *buf;
size_t buf_len; /* in bytes */
/* for input stream:
 * number of trailing 0 bits for padding
 * total number of valid bits in stream: buf_len * 8 - pad_bits */
unsigned int pad_bits;
};
/* Set up @bs over the buffer @s of @len bytes, of which the last
 * @pad_bits bits are padding, and place the cursor on the first bit.
 * The buffer contents are not touched. */
static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
{
	bs->pad_bits = pad_bits;
	bs->buf_len = len;
	bs->buf = s;
	bitstream_cursor_reset(&bs->cur, bs->buf);
}
/* Move the cursor back to the first bit AND clear the whole buffer:
 * all buf_len bytes are zeroed, so previous stream content is lost. */
static inline void bitstream_rewind(struct bitstream *bs)
{
bitstream_cursor_reset(&bs->cur, bs->buf);
memset(bs->buf, 0, bs->buf_len);
}
/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
 * Ignores "pad_bits".
 * Returns zero if bits == 0 (nothing to do).
 * Returns number of bits used if successful.
 *
 * If there is not enough room left in bitstream,
 * leaves bitstream unchanged and returns -ENOBUFS.
 *
 * The bits are ORed into the buffer, so the target bits are expected to
 * be zero beforehand (bitstream_rewind() clears the buffer).
 */
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
unsigned char *b = bs->cur.b;
unsigned int tmp;
if (bits == 0)
return 0;
/* would the last bit to be written land at or beyond buf + buf_len? */
if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
return -ENOBUFS;
/* paranoia: strip off hi bits; they should not be set anyways. */
if (bits < 64)
val &= ~0ULL >> (64 - bits);
/* first (possibly partial) byte: fill it up starting at cur.bit */
*b++ |= (val & 0xff) << bs->cur.bit;
/* remaining bytes; tmp is the number of bits of val stored so far */
for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
*b++ |= (val >> tmp) & 0xff;
bitstream_cursor_advance(&bs->cur, bits);
return bits;
}
/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
 *
 * If more than 64 bits are requested, returns -EINVAL and leaves *out unchanged.
 *
 * If there are fewer than the requested number of valid bits left in the
 * bitstream, still fetches all available bits.
 *
 * Returns number of actually fetched bits.
 *
 * NOTE(review): a negative bit count (passed in, or resulting from the
 * clamping below) is not rejected and would reach the shift by
 * (64 - bits); presumably callers never let that happen -- confirm.
 */
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
u64 val;
unsigned int n;
if (bits > 64)
return -EINVAL;
/* request reaches into the padding or past the buffer:
 * clamp to the number of valid bits that are left */
if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
- bs->cur.bit - bs->pad_bits;
if (bits == 0) {
*out = 0;
return 0;
}
/* get the high bits */
val = 0;
/* number of bytes touched, counting the current one */
n = (bs->cur.bit + bits + 7) >> 3;
/* n may be at most 9, if cur.bit + bits > 64 */
/* which means this copies at most 8 byte */
if (n) {
/* bytes after the current one, shifted up to make room for
 * the (8 - cur.bit) bits taken from the current byte below */
memcpy(&val, bs->cur.b+1, n - 1);
val = le64_to_cpu(val) << (8 - bs->cur.bit);
}
/* we still need the low bits */
val |= bs->cur.b[0] >> bs->cur.bit;
/* and mask out bits we don't want */
val &= ~0ULL >> (64 - bits);
bitstream_cursor_advance(&bs->cur, bits);
*out = val;
return bits;
}
/* encodes @in as vli into @bs;
 * return values
 * > 0: number of bits successfully stored in bitstream
 * -ENOBUFS @bs is full
 * -EINVAL input zero (invalid)
 * -EOVERFLOW input too large for this vli code (invalid)
 */
static inline int vli_encode_bits(struct bitstream *bs, u64 in)
{
	/* Plain zero initialisation instead of the "code = code" self
	 * assignment, which reads an indeterminate value.
	 * __vli_encode_bits() fills it in whenever it returns > 0. */
	u64 code = 0;
	int bits = __vli_encode_bits(&code, in);

	/* error from the encoder: nothing to store */
	if (bits <= 0)
		return bits;

	return bitstream_put_bits(bs, code, bits);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,91 @@
#ifndef _DRBD_WRAPPERS_H
#define _DRBD_WRAPPERS_H
#include <linux/ctype.h>
#include <linux/mm.h>
/* see get_sb_bdev and bd_claim */
extern char *drbd_sec_holder;
/* sets the number of 512 byte sectors of our virtual device:
 * updates the capacity of mdev->vdisk and the byte size (size << 9)
 * stored in the inode of mdev->this_bdev */
static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
sector_t size)
{
/* set_capacity(mdev->this_bdev->bd_disk, size); */
set_capacity(mdev->vdisk, size);
/* cast first, so the shift is done in loff_t */
mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
}
#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
/* Returns 1 if any page attached to @bio (all segments, starting at
 * index 0) has a page_count() above one, 0 otherwise. */
static inline int drbd_bio_has_active_page(struct bio *bio)
{
	struct bio_vec *bvec;
	int active = 0;
	int idx;

	__bio_for_each_segment(bvec, bio, idx, 0) {
		if (page_count(bvec->bv_page) > 1) {
			active = 1;
			break;
		}
	}

	return active;
}
/* bi_end_io handlers */
extern void drbd_md_io_complete(struct bio *bio, int error);
extern void drbd_endio_read_sec(struct bio *bio, int error);
extern void drbd_endio_write_sec(struct bio *bio, int error);
extern void drbd_endio_pri(struct bio *bio, int error);
/*
 * Used to submit our private bio.
 *
 * A bio without a block device is completed with -ENODEV (after logging
 * and dumping the stack); if fault injection of @fault_type is active
 * the bio is completed with -EIO; otherwise it is passed on to
 * generic_make_request().
 */
static inline void drbd_generic_make_request(struct drbd_conf *mdev,
					     int fault_type, struct bio *bio)
{
	__release(local);

	if (bio->bi_bdev) {
		if (FAULT_ACTIVE(mdev, fault_type))
			bio_endio(bio, -EIO);
		else
			generic_make_request(bio);
		return;
	}

	/* no backing device: complain loudly and fail the bio */
	printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
			"bio->bi_bdev == NULL\n",
	       mdev_to_minor(mdev));
	dump_stack();
	bio_endio(bio, -ENODEV);
}
/* Plug the request queue of our own block device, under the queue
 * lock, and cancel the unplug timer that plugging may have armed. */
static inline void drbd_plug_device(struct drbd_conf *mdev)
{
	struct request_queue *q = bdev_get_queue(mdev->this_bdev);

	spin_lock_irq(q->queue_lock);

	/* XXX the check on !blk_queue_plugged is redundant,
	 * implicitly checked in blk_plug_device */
	if (!blk_queue_plugged(q)) {
		blk_plug_device(q);
		/* unplugging should not happen automatically... */
		del_timer(&q->unplug_timer);
	}

	spin_unlock_irq(q->queue_lock);
}
/* Returns 1 if the algorithm type of @tfm, masked with
 * CRYPTO_ALG_TYPE_HASH_MASK, equals CRYPTO_ALG_TYPE_HASH; 0 otherwise. */
static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
{
	if ((crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
	    != CRYPTO_ALG_TYPE_HASH)
		return 0;

	return 1;
}
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)
#endif
#endif

View File

@ -123,7 +123,15 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
{
struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms);
unsigned long timeout;
for (timeout = 20; timeout; timeout--) {
if (!notify[3])
return 0;
udelay(10);
}
timeout = jiffies + msecs_to_jiffies(timeout_ms);
do {
if (!notify[3])

View File

@ -59,12 +59,14 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
for (; nsect > 0; nsect--, block++, buf += tr->blksize)
if (tr->readsect(dev, block, buf))
return -EIO;
rq_flush_dcache_pages(req);
return 0;
case WRITE:
if (!tr->writesect)
return -EIO;
rq_flush_dcache_pages(req);
for (; nsect > 0; nsect--, block++, buf += tr->blksize)
if (tr->writesect(dev, block, buf))
return -EIO;

View File

@ -143,7 +143,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c
struct inode *inode = mapping->host;
struct pohmelfs_inode *pi = POHMELFS_I(inode);
struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb);
struct backing_dev_info *bdi = mapping->backing_dev_info;
int err = 0;
int done = 0;
int nr_pages;
@ -152,11 +151,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c
int scanned = 0;
int range_whole = 0;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
return 0;
}
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
@ -248,10 +242,6 @@ retry:
if (wbc->nr_to_write <= 0)
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
done = 1;
}
continue;
out_continue:

View File

@ -15,6 +15,7 @@
#include <linux/aio_abi.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#define DEBUG 0
@ -32,6 +33,9 @@
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/mempool.h>
#include <linux/hash.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@ -60,6 +64,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
static DEFINE_SPINLOCK(fput_lock);
static LIST_HEAD(fput_head);
#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
struct aio_batch_entry {
struct hlist_node list;
struct address_space *mapping;
};
mempool_t *abe_pool;
static void aio_kick_handler(struct work_struct *);
static void aio_queue_work(struct kioctx *);
@ -73,6 +85,8 @@ static int __init aio_setup(void)
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
aio_wq = create_workqueue("aio");
abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
BUG_ON(!abe_pool);
pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
@ -1531,8 +1545,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode,
return 1;
}
/*
 * Remember @mapping in the on-stack @batch_hash so that
 * aio_batch_free() can kick it once for the whole batch.
 * A mapping that is already recorded is not added twice.  A reference
 * on the mapping's host inode is taken via igrab(); failing to get it
 * is treated as a BUG.
 */
static void aio_batch_add(struct address_space *mapping,
			  struct hlist_head *batch_hash)
{
	struct hlist_head *head =
		&batch_hash[hash_ptr(mapping, AIO_BATCH_HASH_BITS)];
	struct aio_batch_entry *entry;
	struct hlist_node *node;

	hlist_for_each_entry(entry, node, head, list) {
		if (entry->mapping == mapping)
			return;
	}

	entry = mempool_alloc(abe_pool, GFP_KERNEL);
	if (!igrab(mapping->host))
		BUG();
	entry->mapping = mapping;
	hlist_add_head(&entry->list, head);
}
static void aio_batch_free(struct hlist_head *batch_hash)
{
struct aio_batch_entry *abe;
struct hlist_node *pos, *n;
int i;
for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
blk_run_address_space(abe->mapping);
iput(abe->mapping->host);
hlist_del(&abe->list);
mempool_free(abe, abe_pool);
}
}
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
struct iocb *iocb)
struct iocb *iocb, struct hlist_head *batch_hash)
{
struct kiocb *req;
struct file *file;
@ -1608,6 +1658,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
;
}
spin_unlock_irq(&ctx->ctx_lock);
if (req->ki_opcode == IOCB_CMD_PREAD ||
req->ki_opcode == IOCB_CMD_PREADV ||
req->ki_opcode == IOCB_CMD_PWRITE ||
req->ki_opcode == IOCB_CMD_PWRITEV)
aio_batch_add(file->f_mapping, batch_hash);
aio_put_req(req); /* drop extra ref to req */
return 0;
@ -1635,6 +1691,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
struct kioctx *ctx;
long ret = 0;
int i;
struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
if (unlikely(nr < 0))
return -EINVAL;
@ -1666,10 +1723,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
break;
}
ret = io_submit_one(ctx, user_iocb, &tmp);
ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
if (ret)
break;
}
aio_batch_free(batch_hash);
put_ioctx(ctx);
return i ? i : ret;

View File

@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio)
}
}
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void bio_flush_dcache_pages(struct bio *bi)
{
int i;
struct bio_vec *bvec;
bio_for_each_segment(bvec, bi, i)
flush_dcache_page(bvec->bv_page);
}
EXPORT_SYMBOL(bio_flush_dcache_pages);
#endif
/**
* bio_endio - end I/O on a bio
* @bio: bio

View File

@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
return sync_blockdev(I_BDEV(filp->f_mapping->host));
struct block_device *bdev = I_BDEV(filp->f_mapping->host);
int error;
error = sync_blockdev(bdev);
if (error)
return error;
error = blkdev_issue_flush(bdev, NULL);
if (error == -EOPNOTSUPP)
error = 0;
return error;
}
/*

View File

@ -1028,9 +1028,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
if (dio->bio)
dio_bio_submit(dio);
/* All IO is now issued, send it on its way */
blk_run_address_space(inode->i_mapping);
/*
* It is possible that, we return short IO due to end of file.
* In that case, we need to release all the pages we got hold on.
@ -1057,8 +1054,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
((rw & READ) || (dio->result == dio->size)))
ret = -EIOCBQUEUED;
if (ret != -EIOCBQUEUED)
if (ret != -EIOCBQUEUED) {
/* All IO is now issued, send it on its way */
blk_run_address_space(inode->i_mapping);
dio_await_completion(dio);
}
/*
* Sync will always be dropping the final ref and completing the
@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
int acquire_i_mutex = 0;
if (rw & WRITE)
rw = WRITE_ODIRECT;
rw = WRITE_ODIRECT_PLUG;
if (bdev)
bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));

View File

@ -614,7 +614,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
struct writeback_control *wbc)
{
struct super_block *sb = wbc->sb, *pin_sb = NULL;
const int is_blkdev_sb = sb_is_blkdev_sb(sb);
const unsigned long start = jiffies; /* livelock avoidance */
spin_lock(&inode_lock);
@ -635,36 +634,11 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
continue;
}
if (!bdi_cap_writeback_dirty(wb->bdi)) {
redirty_tail(inode);
if (is_blkdev_sb) {
/*
* Dirty memory-backed blockdev: the ramdisk
* driver does this. Skip just this inode
*/
continue;
}
/*
* Dirty memory-backed inode against a filesystem other
* than the kernel-internal bdev filesystem. Skip the
* entire superblock.
*/
break;
}
if (inode->i_state & (I_NEW | I_WILL_FREE)) {
requeue_io(inode);
continue;
}
if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
wbc->encountered_congestion = 1;
if (!is_blkdev_sb)
break; /* Skip a congested fs */
requeue_io(inode);
continue; /* Skip a congested blockdev */
}
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
@ -756,6 +730,7 @@ static long wb_writeback(struct bdi_writeback *wb,
.sync_mode = args->sync_mode,
.older_than_this = NULL,
.for_kupdate = args->for_kupdate,
.for_background = args->for_background,
.range_cyclic = args->range_cyclic,
};
unsigned long oldest_jif;
@ -787,7 +762,6 @@ static long wb_writeback(struct bdi_writeback *wb,
break;
wbc.more_io = 0;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
writeback_inodes_wb(wb, &wbc);

View File

@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc)
{
if (wbc->for_reclaim)
return FLUSH_HIGHPRI | FLUSH_STABLE;
if (wbc->for_kupdate)
if (wbc->for_kupdate || wbc->for_background)
return FLUSH_LOWPRI;
return 0;
}

View File

@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev,
return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
}
ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%u\n", p->discard_alignment);
}
ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = {
&dev_attr_start.attr,
&dev_attr_size.attr,
&dev_attr_alignment_offset.attr,
&dev_attr_discard_alignment.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
p->start_sect = start;
p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
p->discard_alignment = queue_sector_discard_alignment(disk->queue,
start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);

View File

@ -1,7 +1,9 @@
/************************************************************
* EFI GUID Partition Table handling
* Per Intel EFI Specification v1.02
* http://developer.intel.com/technology/efi/efi.htm
*
* http://www.uefi.org/specs/
* http://www.intel.com/technology/efi/
*
* efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
* Copyright 2000,2001,2002,2004 Dell Inc.
*
@ -92,6 +94,7 @@
*
************************************************************/
#include <linux/crc32.h>
#include <linux/math64.h>
#include "check.h"
#include "efi.h"
@ -141,7 +144,8 @@ last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
return (bdev->bd_inode->i_size >> 9) - 1ULL;
return div_u64(bdev->bd_inode->i_size,
bdev_logical_block_size(bdev)) - 1ULL;
}
static inline int
@ -188,6 +192,7 @@ static size_t
read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
{
size_t totalreadcount = 0;
sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
if (!bdev || !buffer || lba > last_lba(bdev))
return 0;
@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
while (count) {
int copied = 512;
Sector sect;
unsigned char *data = read_dev_sector(bdev, lba++, &sect);
unsigned char *data = read_dev_sector(bdev, n++, &sect);
if (!data)
break;
if (copied > count)
@ -257,15 +262,16 @@ static gpt_header *
alloc_read_gpt_header(struct block_device *bdev, u64 lba)
{
gpt_header *gpt;
unsigned ssz = bdev_logical_block_size(bdev);
if (!bdev)
return NULL;
gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
gpt = kzalloc(ssz, GFP_KERNEL);
if (!gpt)
return NULL;
if (read_lba(bdev, lba, (u8 *) gpt,
sizeof (gpt_header)) < sizeof (gpt_header)) {
if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
kfree(gpt);
gpt=NULL;
return NULL;
@ -601,6 +607,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
unsigned ssz = bdev_logical_block_size(bdev) / 512;
if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
@ -611,13 +618,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
pr_debug("GUID Partition Table is valid! Yea!\n");
for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
u64 start = le64_to_cpu(ptes[i].starting_lba);
u64 size = le64_to_cpu(ptes[i].ending_lba) -
le64_to_cpu(ptes[i].starting_lba) + 1ULL;
if (!is_pte_valid(&ptes[i], last_lba(bdev)))
continue;
put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba),
(le64_to_cpu(ptes[i].ending_lba) -
le64_to_cpu(ptes[i].starting_lba) +
1ULL));
put_partition(state, i+1, start * ssz, size * ssz);
/* If this is a RAID volume, tell md */
if (!efi_guidcmp(ptes[i].partition_type_guid,

View File

@ -37,7 +37,6 @@
#define EFI_PMBR_OSTYPE_EFI 0xEF
#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
#define GPT_BLOCK_SIZE 512
#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
#define GPT_HEADER_REVISION_V1 0x00010000
#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
@ -79,7 +78,12 @@ typedef struct _gpt_header {
__le32 num_partition_entries;
__le32 sizeof_partition_entry;
__le32 partition_entry_array_crc32;
u8 reserved2[GPT_BLOCK_SIZE - 92];
/* The rest of the logical block is reserved by UEFI and must be zero.
* EFI standard handles this by:
*
* uint8_t reserved2[ BlockSize - 92 ];
*/
} __attribute__ ((packed)) gpt_header;
typedef struct _gpt_entry_attributes {

View File

@ -826,8 +826,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (!(out_file->f_mode & FMODE_WRITE))
goto fput_out;
retval = -EINVAL;
if (!out_file->f_op || !out_file->f_op->sendpage)
goto fput_out;
in_inode = in_file->f_path.dentry->d_inode;
out_inode = out_file->f_path.dentry->d_inode;
retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);

View File

@ -648,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
ret = buf->ops->confirm(pipe, buf);
if (!ret) {
more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
ret = file->f_op->sendpage(file, buf->page, buf->offset,
sd->len, &pos, more);
if (file->f_op && file->f_op->sendpage)
ret = file->f_op->sendpage(file, buf->page, buf->offset,
sd->len, &pos, more);
else
ret = -EINVAL;
}
return ret;
@ -1068,8 +1070,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
if (unlikely(ret < 0))
return ret;
splice_write = out->f_op->splice_write;
if (!splice_write)
if (out->f_op && out->f_op->splice_write)
splice_write = out->f_op->splice_write;
else
splice_write = default_file_splice_write;
return splice_write(pipe, out, ppos, len, flags);
@ -1093,8 +1096,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
if (unlikely(ret < 0))
return ret;
splice_read = in->f_op->splice_read;
if (!splice_read)
if (in->f_op && in->f_op->splice_read)
splice_read = in->f_op->splice_read;
else
splice_read = default_file_splice_read;
return splice_read(in, ppos, pipe, len, flags);
@ -1316,7 +1320,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (off_in)
return -ESPIPE;
if (off_out) {
if (out->f_op->llseek == no_llseek)
if (!out->f_op || !out->f_op->llseek ||
out->f_op->llseek == no_llseek)
return -EINVAL;
if (copy_from_user(&offset, off_out, sizeof(loff_t)))
return -EFAULT;
@ -1336,7 +1341,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
if (off_out)
return -ESPIPE;
if (off_in) {
if (in->f_op->llseek == no_llseek)
if (!in->f_op || !in->f_op->llseek ||
in->f_op->llseek == no_llseek)
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;

View File

@ -904,16 +904,9 @@ xfs_convert_page(
if (startio) {
if (count) {
struct backing_dev_info *bdi;
bdi = inode->i_mapping->backing_dev_info;
wbc->nr_to_write--;
if (bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
if (wbc->nr_to_write <= 0)
done = 1;
} else if (wbc->nr_to_write <= 0) {
done = 1;
}
}
xfs_start_page_writeback(page, !page_dirty, count);
}

View File

@ -13,6 +13,7 @@
#define flush_cache_dup_mm(mm) do { } while (0)
#define flush_cache_range(vma, start, end) do { } while (0)
#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
#define flush_dcache_page(page) do { } while (0)
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)

View File

@ -331,4 +331,17 @@ static inline int bdi_sched_wait(void *word)
return 0;
}
/*
 * Invoke the unplug_io_fn callback of @bdi, handing @page through
 * unchanged.  A NULL @bdi, or a @bdi without a callback, makes this a
 * no-op.
 */
static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
				       struct page *page)
{
	if (!bdi)
		return;
	if (!bdi->unplug_io_fn)
		return;

	bdi->unplug_io_fn(bdi, page);
}
/*
 * Run the unplug callback of the backing device that @mapping points at.
 * Nothing happens for a NULL @mapping; no specific page is passed on.
 */
static inline void blk_run_address_space(struct address_space *mapping)
{
	if (!mapping)
		return;

	blk_run_backing_dev(mapping->backing_dev_info, NULL);
}
#endif /* _LINUX_BACKING_DEV_H */

View File

@ -391,6 +391,18 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
gfp_t, int);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
/*
 * ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE must be defined (to 0 or non-zero)
 * before this point; the build is stopped otherwise.
 */
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/* Non-zero: only declared here, the definition lives out of line. */
extern void bio_flush_dcache_pages(struct bio *bi);
#else
/* Zero: an empty inline stub, so callers need no #ifdef of their own. */
static inline void bio_flush_dcache_pages(struct bio *bi)
{
}
#endif
extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
unsigned long, unsigned int, int, gfp_t);
extern struct bio *bio_copy_user_iov(struct request_queue *,
@ -450,11 +462,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
/*
* remember never ever reenable interrupts between a bvec_kmap_irq and
* bvec_kunmap_irq!
*
* This function MUST be inlined - it plays with the CPU interrupt flags.
*/
static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
unsigned long *flags)
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
unsigned long addr;
@ -470,8 +479,7 @@ static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
return (char *) addr + bvec->bv_offset;
}
static __always_inline void bvec_kunmap_irq(char *buffer,
unsigned long *flags)
static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
{
unsigned long ptr = (unsigned long) buffer & PAGE_MASK;

View File

@ -312,13 +312,17 @@ struct queue_limits {
unsigned int io_min;
unsigned int io_opt;
unsigned int max_discard_sectors;
unsigned int discard_granularity;
unsigned int discard_alignment;
unsigned short logical_block_size;
unsigned short max_hw_segments;
unsigned short max_phys_segments;
unsigned char misaligned;
unsigned char discard_misaligned;
unsigned char no_cluster;
signed char discard_zeroes_data;
};
struct request_queue
@ -749,6 +753,17 @@ struct req_iterator {
#define rq_iter_last(rq, _iter) \
(_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
/*
 * ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE must be defined (to 0 or non-zero)
 * before this point; the build is stopped otherwise.
 */
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/* Non-zero: only declared here, the definition lives out of line. */
extern void rq_flush_dcache_pages(struct request *rq);
#else
/* Zero: an empty inline stub, so callers need no #ifdef of their own. */
static inline void rq_flush_dcache_pages(struct request *rq)
{
}
#endif
extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
extern void register_disk(struct gendisk *dev);
@ -823,19 +838,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
return bdev->bd_disk->queue;
}
static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
struct page *page)
{
if (bdi && bdi->unplug_io_fn)
bdi->unplug_io_fn(bdi, page);
}
static inline void blk_run_address_space(struct address_space *mapping)
{
if (mapping)
blk_run_backing_dev(mapping->backing_dev_info, NULL);
}
/*
* blk_rq_pos() : the current sector
* blk_rq_bytes() : bytes left in the entire request
@ -1134,6 +1136,34 @@ static inline int bdev_alignment_offset(struct block_device *bdev)
return q->limits.alignment_offset;
}
/*
 * Report the discard alignment stored in the limits of @q, or -1 when
 * those limits are flagged as discard_misaligned.
 */
static inline int queue_discard_alignment(struct request_queue *q)
{
	return q->limits.discard_misaligned ?
		-1 : (int)q->limits.discard_alignment;
}
/*
 * Compute the discard alignment value for the 512-byte sector @sector on
 * queue @q: the byte address of the sector minus the queue's
 * discard_alignment, masked with (discard_granularity - 1).
 *
 * The mask only acts as a remainder when discard_granularity is a power
 * of two.  Returns 0 when discard_granularity is 0.
 */
static inline int queue_sector_discard_alignment(struct request_queue *q,
						 sector_t sector)
{
	unsigned int granularity = q->limits.discard_granularity;

	/*
	 * With a granularity of 0 the mask below would be all-ones and the
	 * caller would get a truncated byte offset instead of an alignment.
	 */
	if (!granularity)
		return 0;

	return ((sector << 9) - q->limits.discard_alignment)
		& (granularity - 1);
}
/*
 * Return 1 only when the limits of @q have discard_zeroes_data set to
 * exactly 1; every other value (the field is signed) yields 0.
 */
static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
{
	return q->limits.discard_zeroes_data == 1;
}
/*
 * Block-device flavour of queue_discard_zeroes_data(): look up the
 * request queue of @bdev and report that queue's answer.
 */
static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	return queue_discard_zeroes_data(q);
}
static inline int queue_dma_alignment(struct request_queue *q)
{
return q ? q->dma_alignment : 511;

View File

@ -60,3 +60,9 @@ SUBSYS(net_cls)
#endif
/* */
#ifdef CONFIG_BLK_CGROUP
SUBSYS(blkio)
#endif
/* */

View File

@ -43,6 +43,8 @@
#define CN_DST_VAL 0x1
#define CN_IDX_DM 0x7 /* Device Mapper */
#define CN_VAL_DM_USERSPACE_LOG 0x1
#define CN_IDX_DRBD 0x8
#define CN_VAL_DRBD 0x1
#define CN_NETLINK_USERS 8

343
include/linux/drbd.h Normal file
View File

@ -0,0 +1,343 @@
/*
drbd.h
Kernel module for 2.6.x Kernels
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef DRBD_H
#define DRBD_H
#include <linux/connector.h>
#include <asm/types.h>
#ifdef __KERNEL__
#include <linux/types.h>
#include <asm/byteorder.h>
#else
#include <sys/types.h>
#include <sys/wait.h>
#include <limits.h>
/* Although the Linux source code makes a difference between
generic endianness and the bitfields' endianness, there is no
architecture as of Linux-2.6.24-rc4 where the bitfields' endianness
does not match the generic endianness. */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define __LITTLE_ENDIAN_BITFIELD
#elif __BYTE_ORDER == __BIG_ENDIAN
#define __BIG_ENDIAN_BITFIELD
#else
# error "sorry, weird endianness on this box"
#endif
#endif
extern const char *drbd_buildtag(void);
#define REL_VERSION "8.3.6"
#define API_VERSION 88
#define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 91
/* Selectable reactions to an I/O error; values spelled out explicitly. */
enum drbd_io_error_p {
	EP_PASS_ON     = 0,	/* FIXME should this better be named "Ignore"? */
	EP_CALL_HELPER = 1,
	EP_DETACH      = 2,
};
/* Selectable fencing settings; values spelled out explicitly. */
enum drbd_fencing_p {
	FP_DONT_CARE = 0,
	FP_RESOURCE  = 1,
	FP_STONITH   = 2,
};
/* Selectable disconnect settings; values spelled out explicitly. */
enum drbd_disconnect_p {
	DP_RECONNECT     = 0,
	DP_DROP_NET_CONF = 1,
	DP_FREEZE_IO     = 2,
};
/*
 * Selectable "after_sb" / rr_conflict settings (the after_sb_0p,
 * after_sb_1p, after_sb_2p and rr_conflict defaults all use
 * ASB_DISCONNECT).  Values spelled out explicitly.
 */
enum drbd_after_sb_p {
	ASB_DISCONNECT          = 0,
	ASB_DISCARD_YOUNGER_PRI = 1,
	ASB_DISCARD_OLDER_PRI   = 2,
	ASB_DISCARD_ZERO_CHG    = 3,
	ASB_DISCARD_LEAST_CHG   = 4,
	ASB_DISCARD_LOCAL       = 5,
	ASB_DISCARD_REMOTE      = 6,
	ASB_CONSENSUS           = 7,
	ASB_DISCARD_SECONDARY   = 8,
	ASB_CALL_HELPER         = 9,
	ASB_VIOLENTLY           = 10,
};
/* KEEP the order, do not delete or insert. Only append.
 *
 * Return codes with explicitly assigned numbers starting at
 * ERR_CODE_BASE (100). The numbering has gaps: 106, 109, 110, 113, 117,
 * 128 and 136 are not assigned here and must stay unused to honour the
 * rule above. AFTER_LAST_ERR_CODE carries no explicit value, so it is
 * always one more than the last code listed before it.
 */
enum drbd_ret_codes {
ERR_CODE_BASE = 100,
NO_ERROR = 101,
ERR_LOCAL_ADDR = 102,
ERR_PEER_ADDR = 103,
ERR_OPEN_DISK = 104,
ERR_OPEN_MD_DISK = 105,
ERR_DISK_NOT_BDEV = 107,
ERR_MD_NOT_BDEV = 108,
ERR_DISK_TO_SMALL = 111,
ERR_MD_DISK_TO_SMALL = 112,
ERR_BDCLAIM_DISK = 114,
ERR_BDCLAIM_MD_DISK = 115,
ERR_MD_IDX_INVALID = 116,
ERR_IO_MD_DISK = 118,
ERR_MD_INVALID = 119,
ERR_AUTH_ALG = 120,
ERR_AUTH_ALG_ND = 121,
ERR_NOMEM = 122,
ERR_DISCARD = 123,
ERR_DISK_CONFIGURED = 124,
ERR_NET_CONFIGURED = 125,
ERR_MANDATORY_TAG = 126,
ERR_MINOR_INVALID = 127,
ERR_INTR = 129, /* EINTR */
ERR_RESIZE_RESYNC = 130,
ERR_NO_PRIMARY = 131,
ERR_SYNC_AFTER = 132,
ERR_SYNC_AFTER_CYCLE = 133,
ERR_PAUSE_IS_SET = 134,
ERR_PAUSE_IS_CLEAR = 135,
ERR_PACKET_NR = 137,
ERR_NO_DISK = 138,
ERR_NOT_PROTO_C = 139,
ERR_NOMEM_BITMAP = 140,
ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */
ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */
ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */
ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */
ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */
ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */
ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */
ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */
ERR_DATA_NOT_CURRENT = 150,
ERR_CONNECTED = 151, /* DRBD 8.3 only */
ERR_PERM = 152,
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
};
#define DRBD_PROT_A 1
#define DRBD_PROT_B 2
#define DRBD_PROT_C 3
/*
 * Role of a node. All values fit into two bits, which is the width of
 * the "role" and "peer" bit-fields in union drbd_state; R_MASK (3) is the
 * matching two-bit mask and not a role of its own.
 */
enum drbd_role {
R_UNKNOWN = 0,
R_PRIMARY = 1, /* role */
R_SECONDARY = 2, /* role */
R_MASK = 3,
};
/*
 * Connection states.  The numeric order of these constants is important:
 *   values below C_WF_REPORT_PARAMS indicate that there is no socket,
 *   values at or above C_WF_REPORT_PARAMS indicate that there is one.
 * C_MASK (31) is the mask for the 5-bit "conn" field of union drbd_state.
 */
enum drbd_conns {
	C_STANDALONE       = 0,
	C_DISCONNECTING    = 1,	/* Temporary state on the way to StandAlone. */
	C_UNCONNECTED      = 2,	/* >= C_UNCONNECTED -> inc_net() succeeds */

	/*
	 * These temporary states are all used on the way from
	 * >= C_CONNECTED to Unconnected.  They are the "disconnect reason"
	 * states; changing between them is not allowed.
	 */
	C_TIMEOUT          = 3,
	C_BROKEN_PIPE      = 4,
	C_NETWORK_FAILURE  = 5,
	C_PROTOCOL_ERROR   = 6,
	C_TEAR_DOWN        = 7,

	C_WF_CONNECTION    = 8,
	C_WF_REPORT_PARAMS = 9,	/* we have a socket */
	C_CONNECTED        = 10,	/* we have introduced each other */
	C_STARTING_SYNC_S  = 11,	/* starting full sync by admin request. */
	C_STARTING_SYNC_T  = 12,	/* starting full sync by admin request. */
	C_WF_BITMAP_S      = 13,
	C_WF_BITMAP_T      = 14,
	C_WF_SYNC_UUID     = 15,

	/*
	 * All SyncStates are tested with this comparison:
	 * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T
	 */
	C_SYNC_SOURCE      = 16,
	C_SYNC_TARGET      = 17,
	C_VERIFY_S         = 18,
	C_VERIFY_T         = 19,
	C_PAUSED_SYNC_S    = 20,
	C_PAUSED_SYNC_T    = 21,

	C_MASK             = 31
};
/*
 * Disk states, in ascending order.  D_MASK (15) is the mask for the
 * 4-bit "disk" and "pdsk" fields of union drbd_state.
 */
enum drbd_disk_state {
	D_DISKLESS     = 0,
	D_ATTACHING    = 1,	/* In the process of reading the meta-data */
	D_FAILED       = 2,	/* Becomes D_DISKLESS as soon as we told it the peer */
				/* when >= D_FAILED it is legal to access mdev->bc */
	D_NEGOTIATING  = 3,	/* Late attaching state, we need to talk to the peer */
	D_INCONSISTENT = 4,
	D_OUTDATED     = 5,
	D_UNKNOWN      = 6,	/* Only used for the peer, never for myself */
	D_CONSISTENT   = 7,	/* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
	D_UP_TO_DATE   = 8,	/* Only this disk state allows applications' IO ! */

	D_MASK         = 15
};
/*
 * Complete state packed into bit-fields. The anonymous struct gives named
 * access to each field, while "i" overlays the same storage as one
 * unsigned int. The field widths add up to 32 bits
 * (2 + 2 + 5 + 4 + 4 + 1 + 1 + 1 + 1 + 11).
 */
union drbd_state {
/* According to gcc's docs, "the order of allocation of bit-fields within
 * a unit (C90 6.5.2.1, C99 6.7.2.1)" is "determined by ABI".
 * Pointed out by Maxim Uvarov <muvarov@ru.mvista.com>.
 * Even though we transmit as "cpu_to_be32(state)",
 * the offsets of the bitfields still need to be swapped
 * on different endianness; the two branches below list the same fields
 * in mirrored order for that reason.
 *
 * NOTE(review): the "n/m" counts in the field comments look stale --
 * enum drbd_conns lists 22 states (not 17) and enum drbd_disk_state
 * lists 9 (not 8). Both still fit their 5-bit and 4-bit fields.
 */
struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned conn:5 ; /* 17/32 cstates */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned susp:1 ; /* 2/2 IO suspended no/yes */
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned peer_isp:1 ;
unsigned user_isp:1 ;
unsigned _pad:11; /* 0 unused */
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned _pad:11; /* 0 unused */
unsigned user_isp:1 ;
unsigned peer_isp:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned susp:1 ; /* 2/2 IO suspended no/yes */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned conn:5 ; /* 17/32 cstates */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
#else
# error "this endianess is not supported"
#endif
};
/* The whole state as a single integer. */
unsigned int i;
};
/*
 * Result codes of a state change request, with explicitly assigned
 * numbers: 1 to 4 carry success-like names, 0 is SS_UNKNOWN_ERROR and
 * the negative values name individual failure reasons. -3 is not
 * assigned. SS_AFTER_LAST_ERROR marks the end of the negative range.
 */
enum drbd_state_ret_codes {
SS_CW_NO_NEED = 4,
SS_CW_SUCCESS = 3,
SS_NOTHING_TO_DO = 2,
SS_SUCCESS = 1,
SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
SS_TWO_PRIMARIES = -1,
SS_NO_UP_TO_DATE_DISK = -2,
SS_NO_LOCAL_DISK = -4,
SS_NO_REMOTE_DISK = -5,
SS_CONNECTED_OUTDATES = -6,
SS_PRIMARY_NOP = -7,
SS_RESYNC_RUNNING = -8,
SS_ALREADY_STANDALONE = -9,
SS_CW_FAILED_BY_PEER = -10,
SS_IS_DISKLESS = -11,
SS_DEVICE_IN_USE = -12,
SS_NO_NET_CONFIG = -13,
SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */
SS_NEED_CONNECTION = -15, /* drbd-8.2 only */
SS_LOWER_THAN_OUTDATED = -16,
SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
};
/* from drbd_strings.c */
extern const char *drbd_conn_str(enum drbd_conns);
extern const char *drbd_role_str(enum drbd_role);
extern const char *drbd_disk_str(enum drbd_disk_state);
extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
#define SHARED_SECRET_MAX 64
#define MDF_CONSISTENT (1 << 0)
#define MDF_PRIMARY_IND (1 << 1)
#define MDF_CONNECTED_IND (1 << 2)
#define MDF_FULL_SYNC (1 << 3)
#define MDF_WAS_UP_TO_DATE (1 << 4)
#define MDF_PEER_OUT_DATED (1 << 5)
#define MDF_CRASHED_PRIMARY (1 << 6)
/*
 * Slot numbers of the UUID array.  UI_SIZE doubles as the number of
 * UUID slots (the get_uuids netlink packet is sized UI_SIZE * 8 bytes).
 */
enum drbd_uuid_index {
	UI_CURRENT       = 0,
	UI_BITMAP        = 1,
	UI_HISTORY_START = 2,
	UI_HISTORY_END   = 3,
	UI_SIZE          = 4,	/* nl-packet: number of dirty bits */
	UI_FLAGS         = 5,	/* nl-packet: flags */
	UI_EXTENDED_SIZE = 6	/* Everything. */
};
/*
 * Selects one of three timeouts. NOTE(review): presumably these map to
 * the wfc, degr-wfc and outdated-wfc startup timeouts -- confirm against
 * the users of this enum.
 */
enum drbd_timeout_flag {
UT_DEFAULT = 0,
UT_DEGRADED = 1,
UT_PEER_OUTDATED = 2,
};
#define UUID_JUST_CREATED ((__u64)4)
#define DRBD_MAGIC 0x83740267
#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
/* these are of type "int" */
#define DRBD_MD_INDEX_INTERNAL -1
#define DRBD_MD_INDEX_FLEX_EXT -2
#define DRBD_MD_INDEX_FLEX_INT -3
/* Start of the new netlink/connector stuff */
#define DRBD_NL_CREATE_DEVICE 0x01
#define DRBD_NL_SET_DEFAULTS 0x02
/* For searching a vacant cn_idx value */
#define CN_IDX_STEP 6977
/*
 * Configuration request header. The fixed fields are followed by a
 * variable-length list of 16-bit tag words (flexible array member).
 */
struct drbd_nl_cfg_req {
int packet_type;
unsigned int drbd_minor;
int flags;
unsigned short tag_list[];
};
/*
 * Configuration reply header. The fixed fields are followed by a
 * variable-length list of 16-bit tag words (flexible array member).
 *
 * NOTE(review): the ret_code comment names "enum ret_code" and
 * "set_st_err_t"; the enums declared in this header are
 * drbd_ret_codes and drbd_state_ret_codes -- presumably those are meant.
 */
struct drbd_nl_cfg_reply {
int packet_type;
unsigned int minor;
int ret_code; /* enum ret_code or set_st_err_t */
unsigned short tag_list[]; /* only used with get_* calls */
};
#endif

137
include/linux/drbd_limits.h Normal file
View File

@ -0,0 +1,137 @@
/*
drbd_limits.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
*/
/*
* Our current limitations.
* Some of them are hard limits,
* some of them are arbitrary range limits, that make it easier to provide
* feedback about nonsense settings for certain configurable values.
*/
#ifndef DRBD_LIMITS_H
#define DRBD_LIMITS_H 1
#define DEBUG_RANGE_CHECK 0
#define DRBD_MINOR_COUNT_MIN 1
#define DRBD_MINOR_COUNT_MAX 255
#define DRBD_DIALOG_REFRESH_MIN 0
#define DRBD_DIALOG_REFRESH_MAX 600
/* valid port number */
#define DRBD_PORT_MIN 1
#define DRBD_PORT_MAX 0xffff
/* startup { */
/* if you want more than 3.4 days, disable */
#define DRBD_WFC_TIMEOUT_MIN 0
#define DRBD_WFC_TIMEOUT_MAX 300000
#define DRBD_WFC_TIMEOUT_DEF 0
#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
/* }*/
/* net { */
/* timeout, unit: tenths of a second (the default of 60 is 6 seconds)
* more than one minute timeout is not useful */
#define DRBD_TIMEOUT_MIN 1
#define DRBD_TIMEOUT_MAX 600
#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
/* active connection retries when C_WF_CONNECTION */
#define DRBD_CONNECT_INT_MIN 1
#define DRBD_CONNECT_INT_MAX 120
#define DRBD_CONNECT_INT_DEF 10 /* seconds */
/* keep-alive probes when idle */
#define DRBD_PING_INT_MIN 1
#define DRBD_PING_INT_MAX 120
#define DRBD_PING_INT_DEF 10
/* timeout for the ping packets.*/
#define DRBD_PING_TIMEO_MIN 1
#define DRBD_PING_TIMEO_MAX 100
#define DRBD_PING_TIMEO_DEF 5
/* max number of write requests between write barriers */
#define DRBD_MAX_EPOCH_SIZE_MIN 1
#define DRBD_MAX_EPOCH_SIZE_MAX 20000
#define DRBD_MAX_EPOCH_SIZE_DEF 2048
/* I don't think that a tcp send buffer of more than 10M is useful */
#define DRBD_SNDBUF_SIZE_MIN 0
#define DRBD_SNDBUF_SIZE_MAX (10<<20)
#define DRBD_SNDBUF_SIZE_DEF 0
#define DRBD_RCVBUF_SIZE_MIN 0
#define DRBD_RCVBUF_SIZE_MAX (10<<20)
#define DRBD_RCVBUF_SIZE_DEF 0
/* @4k PageSize -> 128kB - 512MB */
#define DRBD_MAX_BUFFERS_MIN 32
#define DRBD_MAX_BUFFERS_MAX 131072
#define DRBD_MAX_BUFFERS_DEF 2048
/* @4k PageSize -> 4kB - 512MB */
#define DRBD_UNPLUG_WATERMARK_MIN 1
#define DRBD_UNPLUG_WATERMARK_MAX 131072
#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
/* 0 is disabled.
* 200 should be more than enough even for very short timeouts */
#define DRBD_KO_COUNT_MIN 0
#define DRBD_KO_COUNT_MAX 200
#define DRBD_KO_COUNT_DEF 0
/* } */
/* syncer { */
/* FIXME allow rate to be zero? */
#define DRBD_RATE_MIN 1
/* channel bonding 10 GbE, or other hardware */
#define DRBD_RATE_MAX (4 << 20)
#define DRBD_RATE_DEF 250 /* kb/second */
/* less than 7 would hit performance unnecessarily.
* 3833 is the largest prime that still does fit
* into 64 sectors of activity log */
#define DRBD_AL_EXTENTS_MIN 7
#define DRBD_AL_EXTENTS_MAX 3833
#define DRBD_AL_EXTENTS_DEF 127
#define DRBD_AFTER_MIN -1
#define DRBD_AFTER_MAX 255
#define DRBD_AFTER_DEF -1
/* } */
/* drbdsetup XY resize -d Z
* you are free to reduce the device size to nothing, if you want to.
* the upper limit with 64bit kernel, enough ram and flexible meta data
* is 16 TB, currently. */
/* DRBD_MAX_SECTORS */
#define DRBD_DISK_SIZE_SECT_MIN 0
#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30))
#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
#define DRBD_FENCING_DEF FP_DONT_CARE
#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
#define DRBD_MAX_BIO_BVECS_MIN 0
#define DRBD_MAX_BIO_BVECS_MAX 128
#define DRBD_MAX_BIO_BVECS_DEF 0
#undef RANGE
#endif

137
include/linux/drbd_nl.h Normal file
View File

@ -0,0 +1,137 @@
/*
NL_PACKET( name, number,
TYPE ( pn, pr, member )
...
)
You may never reissue one of the pn arguments
*/
#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
#endif
NL_PACKET(primary, 1,
NL_BIT( 1, T_MAY_IGNORE, overwrite_peer)
)
NL_PACKET(secondary, 2, )
NL_PACKET(disk_conf, 3,
NL_INT64( 2, T_MAY_IGNORE, disk_size)
NL_STRING( 3, T_MANDATORY, backing_dev, 128)
NL_STRING( 4, T_MANDATORY, meta_dev, 128)
NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
NL_INTEGER( 7, T_MAY_IGNORE, fencing)
NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
/* 55 max_bio_size was available in 8.2.6rc2 */
NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
)
NL_PACKET(detach, 4, )
NL_PACKET(net_conf, 5,
NL_STRING( 8, T_MANDATORY, my_addr, 128)
NL_STRING( 9, T_MANDATORY, peer_addr, 128)
NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
NL_INTEGER( 14, T_MAY_IGNORE, timeout)
NL_INTEGER( 15, T_MANDATORY, wire_protocol)
NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
/* 59 addr_family was available in GIT, never released */
NL_BIT( 60, T_MANDATORY, mind_af)
NL_BIT( 27, T_MAY_IGNORE, want_lose)
NL_BIT( 28, T_MAY_IGNORE, two_primaries)
NL_BIT( 41, T_MAY_IGNORE, always_asbp)
NL_BIT( 61, T_MAY_IGNORE, no_cork)
NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
)
NL_PACKET(disconnect, 6, )
NL_PACKET(resize, 7,
NL_INT64( 29, T_MAY_IGNORE, resize_size)
)
NL_PACKET(syncer_conf, 8,
NL_INTEGER( 30, T_MAY_IGNORE, rate)
NL_INTEGER( 31, T_MAY_IGNORE, after)
NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
NL_BIT( 65, T_MAY_IGNORE, use_rle)
)
NL_PACKET(invalidate, 9, )
NL_PACKET(invalidate_peer, 10, )
NL_PACKET(pause_sync, 11, )
NL_PACKET(resume_sync, 12, )
NL_PACKET(suspend_io, 13, )
NL_PACKET(resume_io, 14, )
NL_PACKET(outdate, 15, )
NL_PACKET(get_config, 16, )
NL_PACKET(get_state, 17,
NL_INTEGER( 33, T_MAY_IGNORE, state_i)
)
NL_PACKET(get_uuids, 18,
NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
)
NL_PACKET(get_timeout_flag, 19,
NL_BIT( 36, T_MAY_IGNORE, use_degraded)
)
NL_PACKET(call_helper, 20,
NL_STRING( 38, T_MAY_IGNORE, helper, 32)
)
/* Tag nr 42 already allocated in drbd-8.1 development. */
NL_PACKET(sync_progress, 23,
NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
)
NL_PACKET(dump_ee, 24,
NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
NL_INT64( 48, T_MAY_IGNORE, ee_sector)
NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
)
NL_PACKET(start_ov, 25,
NL_INT64( 66, T_MAY_IGNORE, start_sector)
)
NL_PACKET(new_c_uuid, 26,
NL_BIT( 63, T_MANDATORY, clear_bm)
)
#undef NL_PACKET
#undef NL_INTEGER
#undef NL_INT64
#undef NL_BIT
#undef NL_STRING

View File

@ -0,0 +1,83 @@
#ifndef DRBD_TAG_MAGIC_H
#define DRBD_TAG_MAGIC_H
#define TT_END 0
#define TT_REMOVED 0xE000
/* declare packet_type enums
 *
 * drbd_nl.h is an X-macro list: with the definitions below, every
 * NL_PACKET(name, number, ...) entry becomes "P_<name> = <number>," and
 * the field macros expand to nothing. drbd_nl.h #undefs all five NL_*
 * macros at its end, so each later section of this header defines them
 * anew before including it again.
 */
enum packet_types {
#define NL_PACKET(name, number, fields) P_ ## name = number,
#define NL_INTEGER(pn, pr, member)
#define NL_INT64(pn, pr, member)
#define NL_BIT(pn, pr, member)
#define NL_STRING(pn, pr, member, len)
#include "drbd_nl.h"
/* One more than the number of the last packet listed in drbd_nl.h. */
P_nl_after_last_packet,
};
/* These structs are used to deduce the size of the tag lists:
 * every packet becomes "struct <name>_tag_len_struct" with one member per
 * field plus an int "tag_and_len<member>" (strings additionally get an
 * int "<member>_len").
 *
 * NOTE(review): this section includes "linux/drbd_nl.h" while the other
 * sections of this header include "drbd_nl.h" -- confirm that both
 * spellings resolve to the same file on every include path in use.
 */
#define NL_PACKET(name, number, fields) \
struct name ## _tag_len_struct { fields };
#define NL_INTEGER(pn, pr, member) \
int member; int tag_and_len ## member;
#define NL_INT64(pn, pr, member) \
__u64 member; int tag_and_len ## member;
#define NL_BIT(pn, pr, member) \
unsigned char member:1; int tag_and_len ## member;
#define NL_STRING(pn, pr, member, len) \
unsigned char member[len]; int member ## _len; \
int tag_and_len ## member;
#include "linux/drbd_nl.h"
/* declare tag-list-sizes
 *
 * One entry per NL_PACKET in drbd_nl.h, computed as 2 plus, for every
 * field, 4 bytes plus the payload size (4 for an integer, 8 for an int64,
 * 1 for a bit, len for a string). Presumably the 4 bytes are the 16-bit
 * tag and 16-bit length words and the leading 2 is the terminating
 * TT_END word -- TODO confirm.
 *
 * NOTE(review): the entries are positional (index 0 is the first packet
 * in drbd_nl.h, which has number 1, and the packet numbers skip 21 and
 * 22), so the array index is not the packet number. Verify how callers
 * index this table.
 */
static const int tag_list_sizes[] = {
#define NL_PACKET(name, number, fields) 2 fields ,
#define NL_INTEGER(pn, pr, member) + 4 + 4
#define NL_INT64(pn, pr, member) + 4 + 8
#define NL_BIT(pn, pr, member) + 4 + 1
#define NL_STRING(pn, pr, member, len) + 4 + (len)
#include "drbd_nl.h"
};
/* The two highest bits are used for the tag type */
#define TT_MASK 0xC000
#define TT_INTEGER 0x0000
#define TT_INT64 0x4000
#define TT_BIT 0x8000
#define TT_STRING 0xC000
/* The next bit indicates if processing of the tag is mandatory */
#define T_MANDATORY 0x2000
#define T_MAY_IGNORE 0x0000
#define TN_MASK 0x1fff
/* The remaining 13 bits are used to enumerate the tags */
#define tag_type(T) ((T) & TT_MASK)
#define tag_number(T) ((T) & TN_MASK)
/* declare tag enums
 *
 * NL_PACKET passes its field list straight through, so every field in
 * drbd_nl.h becomes "T_<member> = pn | <type bits> | pr": the tag number
 * in the low 13 bits (TN_MASK), the tag type in the two highest bits
 * (TT_MASK) and pr being T_MANDATORY or T_MAY_IGNORE.
 */
#define NL_PACKET(name, number, fields) fields
enum drbd_tags {
#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
#include "drbd_nl.h"
};
/* Description of one tag, as stored in tag_descriptions[]. */
struct tag {
const char *name; /* member name from drbd_nl.h, as a string */
int type_n_flags; /* TT_* type bits ORed with T_MANDATORY/T_MAY_IGNORE */
int max_len; /* size of the value in bytes; for strings the declared len */
};
/* declare tag names
 *
 * Table of tag descriptions indexed by tag number (pn) through designated
 * initializers; tag numbers that drbd_nl.h does not use are left
 * zero-initialized (name == NULL).
 *
 * NOTE(review): NL_BIT records sizeof(int) as max_len here, while
 * tag_list_sizes[] accounts 1 byte of payload for a bit -- confirm which
 * size the encoder and decoder actually use.
 */
#define NL_PACKET(name, number, fields) fields
static const struct tag tag_descriptions[] = {
#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
#include "drbd_nl.h"
};
#endif

View File

@ -129,7 +129,7 @@ struct inodes_stat_t {
* WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device
* immediately after submission. The write equivalent
* of READ_SYNC.
* WRITE_ODIRECT Special case write for O_DIRECT only.
* WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
* SWRITE_SYNC
* SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
* See SWRITE.
@ -151,7 +151,7 @@ struct inodes_stat_t {
#define READ_META (READ | (1 << BIO_RW_META))
#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
#define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO))
#define SWRITE_SYNC_PLUG \
(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
@ -304,6 +304,7 @@ struct inodes_stat_t {
#define BLKIOOPT _IO(0x12,121)
#define BLKALIGNOFF _IO(0x12,122)
#define BLKPBSZGET _IO(0x12,123)
#define BLKDISCARDZEROES _IO(0x12,124)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */

View File

@ -91,6 +91,7 @@ struct hd_struct {
sector_t start_sect;
sector_t nr_sects;
sector_t alignment_offset;
unsigned int discard_alignment;
struct device __dev;
struct kobject *holder_dir;
int policy, partno;

View File

@ -40,16 +40,11 @@ struct cfq_io_context {
struct io_context *ioc;
unsigned long last_end_request;
sector_t last_request_pos;
unsigned long ttime_total;
unsigned long ttime_samples;
unsigned long ttime_mean;
unsigned int seek_samples;
u64 seek_total;
sector_t seek_mean;
struct list_head queue_list;
struct hlist_node cic_list;
@ -73,6 +68,10 @@ struct io_context {
unsigned short ioprio;
unsigned short ioprio_changed;
#ifdef CONFIG_BLK_CGROUP
unsigned short cgroup_changed;
#endif
/*
* For request batching
*/
@ -99,14 +98,15 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
return NULL;
}
struct task_struct;
#ifdef CONFIG_BLOCK
int put_io_context(struct io_context *ioc);
void exit_io_context(void);
void exit_io_context(struct task_struct *task);
struct io_context *get_io_context(gfp_t gfp_flags, int node);
struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
void copy_io_context(struct io_context **pdst, struct io_context **psrc);
#else
static inline void exit_io_context(void)
static inline void exit_io_context(struct task_struct *task)
{
}

294
include/linux/lru_cache.h Normal file
View File

@ -0,0 +1,294 @@
/*
lru_cache.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef LRU_CACHE_H
#define LRU_CACHE_H
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/string.h> /* for memset */
#include <linux/seq_file.h>
/*
This header file (and its .c file; kernel-doc of functions see there)
define a helper framework to easily keep track of index:label associations,
and changes to an "active set" of objects, as well as pending transactions,
to persistently record those changes.
We use an LRU policy if it is necessary to "cool down" a region currently in
the active set before we can "heat" a previously unused region.
Because of this latter property, it is called "lru_cache".
As it actually Tracks Objects in an Active SeT, we could also call it
toast (incidentally that is what may happen to the data on the
backend storage upon next resync, if we don't get it right).
What for?
We replicate IO (more or less synchronously) to local and remote disk.
For crash recovery after replication node failure,
we need to resync all regions that have been target of in-flight WRITE IO
(in use, or "hot", regions), as we don't know whether or not those WRITEs have
made it to stable storage.
To avoid a "full resync", we need to persistently track these regions.
This is known as "write intent log", and can be implemented as on-disk
(coarse or fine grained) bitmap, or other meta data.
To avoid the overhead of frequent extra writes to this meta data area,
usually the condition is softened to regions that _may_ have been target of
in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
bitmap, trading frequency of meta data transactions against amount of
(possibly unnecessary) resync traffic.
If we set a hard limit on the area that may be "hot" at any given time, we
limit the amount of resync traffic needed for crash recovery.
For recovery after replication link failure,
we need to resync all blocks that have been changed on the other replica
in the meantime, or, if both replicas have been changed independently [*],
all blocks that have been changed on either replica in the meantime.
[*] usually as a result of a cluster split-brain and insufficient protection.
but there are valid use cases to do this on purpose.
Tracking those blocks can be implemented as "dirty bitmap".
Having it fine-grained reduces the amount of resync traffic.
It should also be persistent, to allow for reboots (or crashes)
while the replication link is down.
There are various possible implementations for persistently storing
write intent log information, three of which are mentioned here.
"Chunk dirtying"
The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
To reduce the frequency of bitmap updates for write-intent log purposes,
one could dirty "chunks" (of some size) at a time of the (fine grained)
on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as
possible, flushing it to disk again when a previously "hot" (and on-disk
dirtied as full chunk) area "cools down" again (no IO in flight anymore,
and none expected in the near future either).
"Explicit (coarse) write intent bitmap"
Another implementation could choose a (probably coarse) explicit bitmap,
for write-intent log purposes, in addition to the fine grained dirty bitmap.
"Activity log"
Yet another implementation may keep track of the hot regions, by starting
with an empty set, and writing down a journal of region numbers that have
become "hot", or have "cooled down" again.
To be able to use a ring buffer for this journal of changes to the active
set, we not only record the actual changes to that set, but also record the
not changing members of the set in a round robin fashion. To do so, we use a
fixed (but configurable) number of slots which we can identify by index, and
associate region numbers (labels) with these indices.
For each transaction recording a change to the active set, we record the
change itself (index: -old_label, +new_label), and which index is associated
with which label (index: current_label) within a certain sliding window that
is moved further over the available indices with each such transaction.
Thus, for crash recovery, if the ringbuffer is sufficiently large, we can
accurately reconstruct the active set.
Sufficiently large depends only on maximum number of active objects, and the
size of the sliding window recording "index: current_label" associations within
each transaction.
This is what we call the "activity log".
Currently we need one activity log transaction per single label change, which
does not give much benefit over the "dirty chunks of bitmap" approach, other
than potentially fewer seeks.
We plan to change the transaction format to support multiple changes per
transaction, which then would reduce several (disjoint, "random") updates to
the bitmap into one transaction to the activity log ring buffer.
*/
/* this defines an element in a tracked set
 * .colision is for hash table lookup.
 * When we process a new IO request, we know its sector, thus can deduce the
 * region number (label) easily. To do the label -> object lookup without a
 * full list walk, we use a simple hash table.
 *
 * .list is on one of three lists:
 *  in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
 *     lru: unused but ready to be reused or recycled
 *          (refcnt == 0, lc_number != LC_FREE),
 *    free: unused but ready to be recycled
 *          (refcnt == 0, lc_number == LC_FREE),
 *
 * an element is said to be "in the active set",
 * if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
 *
 * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
 * (total memory usage 2 pages), and up to 3833 elements on the act_log
 * lru_cache, totalling ~215 kB for 64bit architecture, ~53 pages.
 *
 * We usually do not actually free these objects again, but only "recycle"
 * them, as the change "index: -old_label, +LC_FREE" would need a transaction
 * as well. Which also means that using a kmem_cache to allocate the objects
 * from wastes some resources.
 * But it avoids high order page allocations in kmalloc.
 */
struct lc_element {
struct hlist_node colision;
struct list_head list; /* links into the in_use, lru or free list */
unsigned refcnt;
/* back "pointer" (an index, not an address) into the owning cache's
 * element array, for paranoia, and for element -> index lookup.
 * NOTE(review): the original comment used the older names
 * "ts_cache->element[index]" and "ts_element_to_index"; these presumably
 * correspond to lru_cache.lc_element[] and lc_index_of() -- confirm. */
unsigned lc_index;
/* the label currently associated with this element.
 * if we want to track a larger set of objects,
 * it needs to become arch independent u64 */
unsigned lc_number;
/* special label when on free list */
#define LC_FREE (~0U)
};
/* Bookkeeping for one lru_cache instance: the three element lists, the
 * allocation parameters, statistics, state flags, the pending label change,
 * and the per-index lookup tables. */
struct lru_cache {
/* the least recently used item is kept at lru->prev */
struct list_head lru;
struct list_head free;
struct list_head in_use;
/* the pre-created kmem cache to allocate the objects from */
struct kmem_cache *lc_cache;
/* size of tracked objects, used to memset(,0,) them in lc_reset */
size_t element_size;
/* offset of struct lc_element member in the tracked object */
size_t element_off;
/* number of elements (indices) */
unsigned int nr_elements;
/* Arbitrary limit on maximum tracked objects. Practical limit is much
 * lower due to allocation failures, probably. For typical use cases,
 * nr_elements should be a few thousand at most.
 * This also limits the maximum value of lc_element.lc_index, allowing the
 * 8 high bits of .lc_index to be overloaded with flags in the future. */
#define LC_MAX_ACTIVE (1<<24)
/* statistics */
unsigned used; /* number of elements currently on in_use list */
unsigned long hits, misses, starving, dirty, changed;
/* flag-bits for lru_cache; bit numbers are the __LC_* enum values
 * defined below this struct */
unsigned long flags;
/* when changing the label of an index element */
unsigned int new_number;
/* for paranoia when changing the label of an index element */
struct lc_element *changing_element;
/* opaque pointer, not interpreted in this header.
 * NOTE(review): presumably owned by the user of the cache -- confirm. */
void *lc_private;
const char *name;
/* the two tables below have nr_elements entries each
 * (original note: "nr_elements there") */
struct hlist_head *lc_slot;
struct lc_element **lc_element;
};
/* flag-bits for lru_cache
 * The enum values are bit numbers within lru_cache.flags (as used with
 * test_and_set_bit()/clear_bit() further down); the LC_* macros after the
 * enum are the corresponding bit masks. */
enum {
/* debugging aid, to catch concurrent access early.
 * user needs to guarantee exclusive access by proper locking! */
__LC_PARANOIA,
/* if we need to change the set, but currently there is a changing
 * transaction pending, we are "dirty", and must defer further
 * changing requests.
 * This is also the bit that lc_try_lock() sets and lc_unlock() clears. */
__LC_DIRTY,
/* if we need to change the set, but currently there is no free nor
 * unused element available, we are "starving", and must not give out
 * further references, to guarantee that eventually some refcnt will
 * drop to zero and we will be able to make progress again, changing
 * the set, writing the transaction.
 * if the statistics say we are frequently starving,
 * nr_elements is too small. */
__LC_STARVING,
};
/* bit masks matching the bit numbers above */
#define LC_PARANOIA (1<<__LC_PARANOIA)
#define LC_DIRTY (1<<__LC_DIRTY)
#define LC_STARVING (1<<__LC_STARVING)
/* Public interface of the lru_cache helper. The functions are implemented in
 * the accompanying .c file; their kernel-doc lives there, not in this header.
 *
 * NOTE(review): @cache, @e_count, @e_size and @e_off of lc_create() presumably
 * end up in lru_cache.lc_cache, .nr_elements, .element_size and .element_off
 * respectively -- confirm against the implementation. */
extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
unsigned e_count, size_t e_size, size_t e_off);
extern void lc_reset(struct lru_cache *lc);
extern void lc_destroy(struct lru_cache *lc);
extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
extern void lc_del(struct lru_cache *lc, struct lc_element *element);
/* lookup / reference handling; @enr is the label (element number) */
extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
/* seq_file output helpers; the forward declaration keeps the prototypes
 * below valid independent of include order */
struct seq_file;
extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
void (*detail) (struct seq_file *, struct lc_element *));
/**
 * lc_try_lock - can be used to stop lc_get() from changing the tracked set
 * @lc: the lru cache to operate on
 *
 * Sets the __LC_DIRTY bit in @lc->flags via test_and_set_bit().
 *
 * Note that the reference counts and order on the active and lru lists may
 * still change. Returns true if we acquired the lock, i.e. if the bit was
 * not already set; returns 0 otherwise.
 */
static inline int lc_try_lock(struct lru_cache *lc)
{
	int already_set = test_and_set_bit(__LC_DIRTY, &lc->flags);

	return !already_set;
}
/**
 * lc_unlock - unlock @lc, allow lc_get() to change the set again
 * @lc: the lru cache to operate on
 *
 * Clears the __LC_DIRTY bit that lc_try_lock() sets, then issues
 * smp_mb__after_clear_bit().
 */
static inline void lc_unlock(struct lru_cache *lc)
{
	unsigned long *flag_word = &lc->flags;

	clear_bit(__LC_DIRTY, flag_word);
	/* explicit barrier; must stay directly after the clear_bit() above */
	smp_mb__after_clear_bit();
}
/**
 * lc_is_used - check whether a label is currently referenced
 * @lc: the lru cache to operate on
 * @enr: the label to look up
 *
 * Returns 1 if lc_find() yields an element for @enr and that element's
 * refcnt is non-zero, 0 otherwise.
 */
static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
{
	struct lc_element *found = lc_find(lc, enr);

	if (!found)
		return 0;

	return found->refcnt != 0;
}
/* lc_entry - get the containing object of type @type from @ptr, a pointer to
 * its embedded @member; thin wrapper around container_of(). */
#define lc_entry(ptr, type, member) \
container_of(ptr, type, member)
/* index <-> element helpers, implemented in the .c file.
 * NOTE(review): presumably inverse mappings of each other over
 * lru_cache.lc_element[] -- confirm against the implementation. */
extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
#endif

View File

@ -49,6 +49,7 @@ struct writeback_control {
unsigned nonblocking:1; /* Don't get stuck on request queues */
unsigned encountered_congestion:1; /* An output: a queue is full */
unsigned for_kupdate:1; /* A kupdate writeback */
unsigned for_background:1; /* A background writeback */
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */

Some files were not shown because too many files have changed in this diff Show More