Re: [PATCH 04/17] xen: Add essential and required interface headers

3 Jul 2020

Hi Peng,
On Thu, 2020-07-02 at 01:30 +0000, Peng Fan wrote:
...
...
Subject: [PATCH 04/17] xen: Add essential and required interface
headers
From: Oleksandr Andrushchenko oleksandr_andrushchenko@epam.com
Add essential and required Xen interface headers only taken from
the stable Linux kernel stable/linux-5.7.y at commit
66dfe45221605e11f38a0bf5eb2ee808cea7cfe7.
Please use commit <12+> ("commit header")
Ok, will fix it in the next version.
...
...
These are better suited for U-boot than the original headers
from Xen as they are the stripped versions of the same.
At the same time use public protocols from Xen RELEASE-4.13.1, at
commit 6278553325a9f76d37811923221b21db3882e017
Please use commit <12+> ("commit header")
Ok, will fix it in the next version.
...
Then:
Acked-by: Peng Fan peng.fan@nxp.com
Regards,
Anastasiia
...
...
as those have more comments in them.
Signed-off-by: Oleksandr Andrushchenko
oleksandr_andrushchenko@epam.com
Signed-off-by: Anastasiia Lukianenko <anastasiia_lukianenko@epam.com>

include/xen/arm/interface.h           |  88 ++++
 include/xen/interface/event_channel.h | 281 ++++++++++
 include/xen/interface/grant_table.h   | 582 +++++++++++++++++++++
 include/xen/interface/hvm/hvm_op.h    |  69 +++
 include/xen/interface/hvm/params.h    | 127 +++++
 include/xen/interface/io/blkif.h      | 726 ++++++++++++++++++++++++++
 include/xen/interface/io/console.h    |  56 ++
 include/xen/interface/io/protocols.h  |  42 ++
 include/xen/interface/io/ring.h       | 479 +++++++++++++++++
 include/xen/interface/io/xenbus.h     |  81 +++
 include/xen/interface/io/xs_wire.h    | 151 ++++++
 include/xen/interface/memory.h        | 332 ++++++++++++
 include/xen/interface/sched.h         | 188 +++++++
 include/xen/interface/xen.h           | 225 ++++++++
 14 files changed, 3427 insertions(+)
 create mode 100644 include/xen/arm/interface.h
 create mode 100644 include/xen/interface/event_channel.h
 create mode 100644 include/xen/interface/grant_table.h
 create mode 100644 include/xen/interface/hvm/hvm_op.h
 create mode 100644 include/xen/interface/hvm/params.h
 create mode 100644 include/xen/interface/io/blkif.h
 create mode 100644 include/xen/interface/io/console.h
 create mode 100644 include/xen/interface/io/protocols.h
 create mode 100644 include/xen/interface/io/ring.h
 create mode 100644 include/xen/interface/io/xenbus.h
 create mode 100644 include/xen/interface/io/xs_wire.h
 create mode 100644 include/xen/interface/memory.h
 create mode 100644 include/xen/interface/sched.h
 create mode 100644 include/xen/interface/xen.h

diff --git a/include/xen/arm/interface.h
b/include/xen/arm/interface.h
new file mode 100644
index 0000000000..79d5ae8563
--- /dev/null
+++ b/include/xen/arm/interface.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/************************************************************



Guest OS interface to ARM Xen.







Stefano Stabellini stefano.stabellini@eu.citrix.com, Citrix,



2012

*/


+#ifndef _ASM_ARM_XEN_INTERFACE_H
+#define _ASM_ARM_XEN_INTERFACE_H



+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#endif



+#define uint64_aligned_t u64 __attribute__((aligned(8)))



+#define __DEFINE_GUEST_HANDLE(name, type) \

typedef struct { union { type *p; uint64_aligned_t q; }; }  \
__guest_handle_ ## name




+#define DEFINE_GUEST_HANDLE_STRUCT(name) \

__DEFINE_GUEST_HANDLE(name, struct name)

+#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name,
name)
+#define GUEST_HANDLE(name)        __guest_handle_ ## name



+#define set_xen_guest_handle(hnd, val)			\

do {						\
if (sizeof(hnd) == 8)			\


	*(u64 *)&(hnd) = 0;	\


(hnd).p = val;				\


} while (0)


+#define __HYPERVISOR_platform_op_raw __HYPERVISOR_platform_op



+#ifndef __ASSEMBLY__
+/* Explicitly size integers that represent pfns in the interface
with


Xen so that we can have one ABI that works for 32 and 64 bit



guests.


Note that this means that the xen_pfn_t type may be capable of



representing pfn's which the guest cannot represent in its own



pfn


type. However since pfn space is controlled by the guest this



is


fine since it simply wouldn't be able to create any such pfns



in


the first place.


*/

+typedef u64 xen_pfn_t;
+#define PRI_xen_pfn "llx"
+typedef u64 xen_ulong_t;
+#define PRI_xen_ulong "llx"
+typedef s64 xen_long_t;
+#define PRI_xen_long "llx"
+/* Guest handles for primitive C types. */
+__DEFINE_GUEST_HANDLE(uchar, unsigned char);
+__DEFINE_GUEST_HANDLE(uint,  unsigned int);
+DEFINE_GUEST_HANDLE(char);
+DEFINE_GUEST_HANDLE(int);
+DEFINE_GUEST_HANDLE(void);
+DEFINE_GUEST_HANDLE(u64);
+DEFINE_GUEST_HANDLE(u32);
+DEFINE_GUEST_HANDLE(xen_pfn_t);
+DEFINE_GUEST_HANDLE(xen_ulong_t);



+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 1



+struct arch_vcpu_info { };
+struct arch_shared_info { };



+/* TODO: Move pvclock definitions some place arch independent */
+struct pvclock_vcpu_time_info {

u32   version;
u32   pad0;
u64   tsc_timestamp;
u64   system_time;
u32   tsc_to_system_mul;
s8    tsc_shift;
u8    flags;
u8    pad[2];

+} __attribute__((__packed__)); /* 32 bytes */



+/* It is OK to have a 16 bytes struct with no padding because it
is packed */
+struct pvclock_wall_clock {

u32   version;
u32   sec;
u32   nsec;
u32   sec_hi;

+} __attribute__((__packed__));
+#endif



+#endif /* _ASM_ARM_XEN_INTERFACE_H */
diff --git a/include/xen/interface/event_channel.h
b/include/xen/interface/event_channel.h
new file mode 100644
index 0000000000..8174999c2f
--- /dev/null
+++ b/include/xen/interface/event_channel.h
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/************************************************************



event_channel.h







Event channels between domains.







Copyright (c) 2003-2004, K A Fraser.


*/


+#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
+#define __XEN_PUBLIC_EVENT_CHANNEL_H__



+#include <xen/interface/xen.h>



+typedef u32 evtchn_port_t;
+DEFINE_GUEST_HANDLE(evtchn_port_t);



+/*


EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and



mark as


accepting interdomain bindings from domain <remote_dom>. A



fresh port


is allocated in <dom> and returned as <port>.



NOTES:




If the caller is unprivileged then <dom> must be





DOMID_SELF.



<rdom> may be DOMID_SELF, allowing loopback connections.




*/

+#define EVTCHNOP_alloc_unbound	  6
+struct evtchn_alloc_unbound {

/* IN parameters */
domid_t dom, remote_dom;
/* OUT parameters */
evtchn_port_t port;

+};



+/*


EVTCHNOP_bind_interdomain: Construct an interdomain event



channel
between


the calling domain and <remote_dom>. <remote_dom,remote_port>



must
identify


a port that is unbound and marked as accepting bindings from



the calling


domain. A fresh port is allocated in the calling domain and



returned as


<local_port>.



NOTES:




<remote_dom> may be DOMID_SELF, allowing loopback





connections.

*/

+#define EVTCHNOP_bind_interdomain 0
+struct evtchn_bind_interdomain {

/* IN parameters. */
domid_t remote_dom;
evtchn_port_t remote_port;
/* OUT parameters. */
evtchn_port_t local_port;

+};



+/*


EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on



specified


vcpu.



NOTES:




A virtual IRQ may be bound to at most one event channel per





vcpu.



The allocated event channel is bound to the specified vcpu.





The
binding


may not be changed.




*/

+#define EVTCHNOP_bind_virq	  1
+struct evtchn_bind_virq {

/* IN parameters. */
u32 virq;
u32 vcpu;
/* OUT parameters. */
evtchn_port_t port;

+};



+/*


EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.



NOTES:




A physical IRQ may be bound to at most one event channel





per
domain.



Only a sufficiently-privileged domain may bind to a





physical IRQ.

*/

+#define EVTCHNOP_bind_pirq	  2
+struct evtchn_bind_pirq {

/* IN parameters. */
u32 pirq;

+#define BIND_PIRQ__WILL_SHARE 1

u32 flags; /* BIND_PIRQ__* */
/* OUT parameters. */
evtchn_port_t port;

+};



+/*


EVTCHNOP_bind_ipi: Bind a local event channel to receive



events.


NOTES:




The allocated event channel is bound to the specified vcpu.





The
binding


may not be changed.




*/

+#define EVTCHNOP_bind_ipi	  7
+struct evtchn_bind_ipi {

u32 vcpu;
/* OUT parameters. */
evtchn_port_t port;

+};



+/*


EVTCHNOP_close: Close a local event channel <port>. If the



channel is


interdomain then the remote end is placed in the unbound state



(EVTCHNSTAT_unbound), awaiting a new connection.


*/

+#define EVTCHNOP_close		  3
+struct evtchn_close {

/* IN parameters. */
evtchn_port_t port;

+};



+/*


EVTCHNOP_send: Send an event to the remote end of the channel



whose
local


endpoint is <port>.


*/

+#define EVTCHNOP_send		  4
+struct evtchn_send {

/* IN parameters. */
evtchn_port_t port;

+};



+/*


EVTCHNOP_status: Get the current status of the communication



channel
which


has an endpoint at <dom, port>.



NOTES:




<dom> may be specified as DOMID_SELF.






Only a sufficiently-privileged domain may obtain the status





of an
event


channel for which <dom> is not DOMID_SELF.




*/

+#define EVTCHNOP_status		  5
+struct evtchn_status {

/* IN parameters */
domid_t  dom;
evtchn_port_t port;
/* OUT parameters */

+#define EVTCHNSTAT_closed	0  /* Channel is not in use.		
     */
+#define EVTCHNSTAT_unbound	1  /* Channel is waiting interdom
connection.*/
+#define EVTCHNSTAT_interdomain	2  /* Channel is connected to
remote
domain. */
+#define EVTCHNSTAT_pirq		3  /* Channel is bound to a
phys IRQ line.
*/
+#define EVTCHNSTAT_virq		4  /* Channel is bound to a
virtual IRQ line
*/
+#define EVTCHNSTAT_ipi		5  /* Channel is bound to a
virtual IPI line
*/

u32 status;
u32 vcpu;		   /* VCPU to which this channel is

bound.   */

union {
struct {


	domid_t dom;


} unbound; /* EVTCHNSTAT_unbound */


struct {


	domid_t dom;


	evtchn_port_t port;


} interdomain; /* EVTCHNSTAT_interdomain */


u32 pirq;	    /* EVTCHNSTAT_pirq	      */


u32 virq;	    /* EVTCHNSTAT_virq	      */


} u;

+};



+/*


EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify



when
an


event is pending.



NOTES:




IPI- and VIRQ-bound channels always notify the vcpu that





initialised


the binding. This binding cannot be changed.






All other channels notify vcpu0 by default. This default is





set when


the channel is allocated (a port that is freed and





subsequently reused


has its binding reset to vcpu0).




*/

+#define EVTCHNOP_bind_vcpu	  8
+struct evtchn_bind_vcpu {

/* IN parameters. */
evtchn_port_t port;
u32 vcpu;

+};



+/*


EVTCHNOP_unmask: Unmask the specified local event-channel port



and
deliver


a notification to the appropriate VCPU if an event is pending.


*/

+#define EVTCHNOP_unmask		  9
+struct evtchn_unmask {

/* IN parameters. */
evtchn_port_t port;

+};



+/*


EVTCHNOP_reset: Close all event channels associated with



specified
domain.


NOTES:




<dom> may be specified as DOMID_SELF.






Only a sufficiently-privileged domain may specify other





than
DOMID_SELF.

*/

+#define EVTCHNOP_reset		 10
+struct evtchn_reset {

/* IN parameters. */
domid_t dom;

+};



+typedef struct evtchn_reset evtchn_reset_t;



+/*


EVTCHNOP_init_control: initialize the control block for the



FIFO ABI.

*/

+#define EVTCHNOP_init_control    11
+struct evtchn_init_control {

/* IN parameters. */
u64 control_gfn;
u32 offset;
u32 vcpu;
/* OUT parameters. */
u8 link_bits;
u8 _pad[7];

+};



+/*


EVTCHNOP_expand_array: add an additional page to the event



array.

*/

+#define EVTCHNOP_expand_array    12
+struct evtchn_expand_array {

/* IN parameters. */
u64 array_gfn;

+};



+/*


EVTCHNOP_set_priority: set the priority for an event channel.


*/

+#define EVTCHNOP_set_priority    13
+struct evtchn_set_priority {

/* IN parameters. */
evtchn_port_t port;
u32 priority;

+};



+struct evtchn_op {

u32 cmd; /* EVTCHNOP_* */
union {
struct evtchn_alloc_unbound    alloc_unbound;


struct evtchn_bind_interdomain bind_interdomain;


struct evtchn_bind_virq	       bind_virq;


struct evtchn_bind_pirq	       bind_pirq;


struct evtchn_bind_ipi	       bind_ipi;


struct evtchn_close	       close;


struct evtchn_send	       send;


struct evtchn_status	       status;


struct evtchn_bind_vcpu	       bind_vcpu;


struct evtchn_unmask	       unmask;


} u;

+};



+DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);



+/*


2-level ABI


*/


+#define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) *
sizeof(xen_ulong_t) * 64)



+/*


FIFO ABI


*/


+/* Events may have priorities from 0 (highest) to 15 (lowest). */
+#define EVTCHN_FIFO_PRIORITY_MAX     0
+#define EVTCHN_FIFO_PRIORITY_DEFAULT 7
+#define EVTCHN_FIFO_PRIORITY_MIN     15



+#define EVTCHN_FIFO_MAX_QUEUES (EVTCHN_FIFO_PRIORITY_MIN + 1)



+typedef u32 event_word_t;



+#define EVTCHN_FIFO_PENDING 31
+#define EVTCHN_FIFO_MASKED  30
+#define EVTCHN_FIFO_LINKED  29
+#define EVTCHN_FIFO_BUSY    28



+#define EVTCHN_FIFO_LINK_BITS 17
+#define EVTCHN_FIFO_LINK_MASK ((1 << EVTCHN_FIFO_LINK_BITS) - 1)



+#define EVTCHN_FIFO_NR_CHANNELS (1 << EVTCHN_FIFO_LINK_BITS)



+struct evtchn_fifo_control_block {

u32     ready;
u32     _rsvd;
event_word_t head[EVTCHN_FIFO_MAX_QUEUES];

+};



+#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
diff --git a/include/xen/interface/grant_table.h
b/include/xen/interface/grant_table.h
new file mode 100644
index 0000000000..197a0d0d58
--- /dev/null
+++ b/include/xen/interface/grant_table.h
@@ -0,0 +1,582 @@
+/************************************************************



grant_table.h







Interface for granting foreign access to page frames, and



receiving


page-ownership transfers.







Permission is hereby granted, free of charge, to any person



obtaining a
copy


of this software and associated documentation files (the



"Software"), to


deal in the Software without restriction, including without



limitation the


rights to use, copy, modify, merge, publish, distribute,



sublicense, and/or


sell copies of the Software, and to permit persons to whom the



Software is


furnished to do so, subject to the following conditions:







The above copyright notice and this permission notice shall be



included in


all copies or substantial portions of the Software.







THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY



KIND, EXPRESS OR


IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF



MERCHANTABILITY,


FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO



EVENT SHALL THE


AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,



DAMAGES OR OTHER


LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,



ARISING


FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE



OR OTHER


DEALINGS IN THE SOFTWARE.







Copyright (c) 2004, K A Fraser


*/


+#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
+#define __XEN_PUBLIC_GRANT_TABLE_H__



+#include <xen/interface/xen.h>



+/***********************************


GRANT TABLE REPRESENTATION


*/


+/* Some rough guidelines on accessing and updating grant-table
entries


in a concurrency-safe manner. For more information, Linux



contains a


reference implementation for guest OSes



(arch/xen/kernel/grant_table.c).






NB. WMB is a no-op on current-generation x86 processors.



However, a


compiler barrier will still be required.









Introducing a valid entry into the grant table:




Write ent->domid.






Write ent->frame:





 GTF_permit_access:   Frame to which access is permitted.





 GTF_accept_transfer: Pseudo-phys frame slot being filled





by new


                      frame, or zero if none.






Write memory barrier (WMB).






Write ent->flags, inc. valid type.









Invalidating an unused GTF_permit_access entry:




flags = ent->flags.






Observe that !(flags & (GTF_reading|GTF_writing)).






Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).





NB. No need for WMB as reuse of entry is control-dependent on



success
of


 step 3, and all architectures guarantee ordering of ctrl-





dep writes.






Invalidating an in-use GTF_permit_access entry:



This cannot be done directly. Request assistance from the



domain
controller


which can set a timeout on the use of a grant entry and take



necessary


action. (NB. This is not yet implemented!).







Invalidating an unused GTF_accept_transfer entry:




flags = ent->flags.






Observe that !(flags & GTF_transfer_committed). [*]






Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).





NB. No need for WMB as reuse of entry is control-dependent on



success
of


 step 3, and all architectures guarantee ordering of ctrl-





dep writes.


[*] If GTF_transfer_committed is set then the grant entry is



'committed'.


 The guest must /not/ modify the grant entry until the





address of
the


 transferred frame is written. It is safe for the guest to





spin waiting


 for this to occur (detect by observing





GTF_transfer_completed in


 ent->flags).









Invalidating a committed GTF_accept_transfer entry:




Wait for (ent->flags & GTF_transfer_completed).









Changing a GTF_permit_access from writable to read-only:



Use SMP-safe CMPXCHG to set GTF_readonly, while



checking !GTF_writing.






Changing a GTF_permit_access from read-only to writable:



Use SMP-safe bit-setting instruction.


*/


+/*


Reference to a grant entry in a specified domain's grant table.


*/

+typedef u32 grant_ref_t;



+/*


A grant table comprises a packed array of grant entries in one



or more


page frames shared between Xen and a guest.



[XEN]: This field is written by Xen and read by the sharing



guest.


[GST]: This field is written by the guest and read by Xen.


*/


+/*


Version 1 of the grant table entry structure is maintained



purely


for backwards compatibility.  New guests should use version 2.


*/

+struct grant_entry_v1 {

/* GTF_xxx: various type and flag information.  [XEN,GST] */
u16 flags;
/* The domain being granted foreign privileges. [GST] */
domid_t  domid;
/*
* GTF_permit_access: Frame that @domid is allowed to map and



access. [GST]

* GTF_accept_transfer: Frame whose ownership transferred by



@domid. [XEN]

*/


u32 frame;

+};



+/*


Type of grant entry.



GTF_invalid: This grant entry grants no privileges.



GTF_permit_access: Allow @domid to map/access @frame.



GTF_accept_transfer: Allow @domid to transfer ownership of one



page
frame


                  to this guest. Xen writes the page number





to
@frame.


GTF_transitive: Allow @domid to transitively access a subrange



of


             @trans_grant in @trans_domid.  No mappings are





allowed.

*/

+#define GTF_invalid         (0U << 0)
+#define GTF_permit_access   (1U << 0)
+#define GTF_accept_transfer (2U << 0)
+#define GTF_transitive      (3U << 0)
+#define GTF_type_mask       (3U << 0)



+/*


Subflags for GTF_permit_access.



GTF_readonly: Restrict @domid to read-only mappings and



accesses.
[GST]


GTF_reading: Grant entry is currently mapped for reading by



@domid.
[XEN]


GTF_writing: Grant entry is currently mapped for writing by



@domid.
[XEN]


GTF_sub_page: Grant access to only a subrange of the



page.  @domid


           will only be allowed to copy from the grant, and





not


           map it. [GST]




*/

+#define _GTF_readonly       (2)
+#define GTF_readonly        (1U << _GTF_readonly)
+#define _GTF_reading        (3)
+#define GTF_reading         (1U << _GTF_reading)
+#define _GTF_writing        (4)
+#define GTF_writing         (1U << _GTF_writing)
+#define _GTF_sub_page       (8)
+#define GTF_sub_page        (1U << _GTF_sub_page)



+/*


Subflags for GTF_accept_transfer:



GTF_transfer_committed: Xen sets this flag to indicate that it



is
committed


 to transferring ownership of a page frame. When a guest





sees this
flag


 it must /not/ modify the grant entry until





GTF_transfer_completed
is


 set by Xen.





GTF_transfer_completed: It is safe for the guest to spin-wait



on this flag


 after reading GTF_transfer_committed. Xen will always





write the
frame


 address, followed by ORing this flag, in a timely manner.




*/

+#define _GTF_transfer_committed (2)
+#define GTF_transfer_committed  (1U << _GTF_transfer_committed)
+#define _GTF_transfer_completed (3)
+#define GTF_transfer_completed  (1U << _GTF_transfer_completed)



+/*


Version 2 grant table entries.  These fulfil the same role as



version 1 entries, but can represent more complicated



operations.


Any given domain will have either a version 1 or a version 2



table,


and every entry in the table will be the same version.







The interface by which domains use grant references does not



depend


on the grant table version in use by the other domain.


*/


+/*


Version 1 and version 2 grant entries share a common



prefix.  The


fields of the prefix are documented as part of struct



grant_entry_v1.


*/

+struct grant_entry_header {

u16 flags;
domid_t  domid;

+};



+/*


Version 2 of the grant entry structure, here is a union because



three


different types are supported: full_page, sub_page and



transitive.

*/

+union grant_entry_v2 {

struct grant_entry_header hdr;

/*
* This member is used for V1-style full page grants, where



either:

*


* -- hdr.type is GTF_accept_transfer, or


* -- hdr.type is GTF_permit_access and GTF_sub_page is not



set.

*


* In that case, the frame field has the same semantics as the


* field of the same name in the V1 entry structure.


*/


struct {
struct grant_entry_header hdr;
u32 pad0;
u64 frame;
} full_page;

/*
* If the grant type is GTF_permit_access and GTF_sub_page is



set,

* @domid is allowed to access bytes [@page_off,@



page_off+@length)

* in frame @frame.


*/


struct {
struct grant_entry_header hdr;
u16 page_off;
u16 length;
u64 frame;
} sub_page;

/*
* If the grant is GTF_transitive, @domid is allowed to use the


* grant @gref in domain @trans_domid, as if it was the local


* domain.  Obviously, the transitive access must be compatible


* with the original grant.


*/


struct {
struct grant_entry_header hdr;
domid_t trans_domid;
u16 pad0;
grant_ref_t gref;
} transitive;

u32 __spacer[4]; /* Pad to a power of two */

+};



+typedef u16 grant_status_t;



+/***********************************


GRANT TABLE QUERIES AND USES


*/


+/*


Handle to track a mapping created via a grant reference.


*/

+typedef u32 grant_handle_t;



+/*


GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for



access


by devices and/or host CPUs. If successful, <handle> is a



tracking number


that must be presented later to destroy the mapping(s). On



error,
<handle>
+ * is a negative status code.
+ * NOTES:
+ *  1. If GNTMAP_device_map is specified then <dev_bus_addr> is
the
address
+ *     via which I/O devices may access the granted frame.
+ *  2. If GNTMAP_host_map is specified then a mapping will be
added at
+ *     either a host virtual address in the current address space,
or at
+ *     a PTE at the specified machine address.  The type of
mapping to
+ *     perform is selected through the GNTMAP_contains_pte flag,
and the
+ *     address is specified in <host_addr>.
+ *  3. Mappings should only be destroyed via
GNTTABOP_unmap_grant_ref.
If a
+ *     host mapping is destroyed by other means then it is *NOT*
guaranteed
+ *     to be accounted to the correct grant reference!
+ */
+#define GNTTABOP_map_grant_ref        0
+struct gnttab_map_grant_ref {
+	/* IN parameters. */
+	u64 host_addr;
+	u32 flags;               /* GNTMAP_* */
+	grant_ref_t ref;
+	domid_t  dom;
+	/* OUT parameters. */
+	s16  status;              /* GNTST_* */
+	grant_handle_t handle;
+	u64 dev_bus_addr;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
+
+/*
+ * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference
mappings
+ * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero,
that
+ * field is ignored. If non-zero, they must refer to a device/host
mapping
+ * that is tracked by <handle>
+ * NOTES:
+ *  1. The call may fail in an undefined manner if either mapping
is not
+ *     tracked by <handle>.
+ *  2. After executing a batch of unmaps, it is guaranteed that no
stale
+ *     mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_grant_ref      1
+struct gnttab_unmap_grant_ref {
+	/* IN parameters. */
+	u64 host_addr;
+	u64 dev_bus_addr;
+	grant_handle_t handle;
+	/* OUT parameters. */
+	s16  status;              /* GNTST_* */
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
+
+/*
+ * GNTTABOP_setup_table: Set up a grant table for <dom> comprising
at
least
+ * <nr_frames> pages. The frame addresses are written to the
<frame_list>.
+ * Only <nr_frames> addresses are written, even if the table is
larger.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may specify <dom> !=
DOMID_SELF.
+ *  3. Xen may not support more than a single grant-table page per
domain.
+ */
+#define GNTTABOP_setup_table          2
+struct gnttab_setup_table {
+	/* IN parameters. */
+	domid_t  dom;
+	u32 nr_frames;
+	/* OUT parameters. */
+	s16  status;              /* GNTST_* */
+
+	GUEST_HANDLE(xen_pfn_t)frame_list;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table);
+
+/*
+ * GNTTABOP_dump_table: Dump the contents of the grant table to
the
+ * xen console. Debugging use only.
+ */
+#define GNTTABOP_dump_table           3
+struct gnttab_dump_table {
+	/* IN parameters. */
+	domid_t dom;
+	/* OUT parameters. */
+	s16 status;               /* GNTST_* */
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table);
+
+/*
+ * GNTTABOP_transfer: Transfer <frame> to a foreign
domain. The
+ * foreign domain has previously registered its interest in the
transfer via
+ * <domid, ref>.
+ *
+ * Note that, even if the transfer fails, the specified page no
longer belongs
+ * to the calling domain *unless* the error is GNTST_bad_page.
+ */
+#define GNTTABOP_transfer                4
+struct gnttab_transfer {
+	/* IN parameters. */
+	xen_pfn_t mfn;
+	domid_t       domid;
+	grant_ref_t   ref;
+	/* OUT parameters. */
+	s16       status;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer);
+
+/*
+ * GNTTABOP_copy: Hypervisor based copy
+ * source and destinations can be either MFNs or, for foreign
domains,
+ * grant references. The foreign domain has to grant read/write
access
+ * in its grant table.
+ *
+ * The flags specify what type source and destinations are (either
MFN
+ * or grant reference).
+ *
+ * Note that this can also be used to copy data between two
domains
+ * via a third party if the source and destination domains had
previously
+ * granted appropriate access to their pages to the third party.
+ *
+ * source_offset specifies an offset in the source frame,
dest_offset
+ * the offset in the target frame and  len specifies the number of
+ * bytes to be copied.
+ */
+
+#define _GNTCOPY_source_gref      (0)
+#define GNTCOPY_source_gref       (1 << _GNTCOPY_source_gref)
+#define _GNTCOPY_dest_gref        (1)
+#define GNTCOPY_dest_gref         (1 << _GNTCOPY_dest_gref)
+
+#define GNTTABOP_copy                 5
+struct gnttab_copy {
+	/* IN parameters. */
+	struct {
+		union {
+			grant_ref_t ref;
+			xen_pfn_t   gmfn;
+		} u;
+		domid_t  domid;
+		u16 offset;
+	} source, dest;
+	u16      len;
+	u16      flags;          /* GNTCOPY_* */
+	/* OUT parameters. */
+	s16       status;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_copy);
+
+/*
+ * GNTTABOP_query_size: Query the current and maximum sizes of the
shared
+ * grant table.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may specify <dom> !=
DOMID_SELF.
+ */
+#define GNTTABOP_query_size           6
+struct gnttab_query_size {
+	/* IN parameters. */
+	domid_t  dom;
+	/* OUT parameters. */
+	u32 nr_frames;
+	u32 max_nr_frames;
+	s16  status;              /* GNTST_* */
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
+
+/*
+ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference
mappings
+ * tracked by <handle> but atomically replace the page table entry
with one
+ * pointing to the machine address under <new_addr>.  <new_addr>
will
be
+ * redirected to the null entry.
+ * NOTES:
+ *  1. The call may fail in an undefined manner if either mapping
is not
+ *     tracked by <handle>.
+ *  2. After executing a batch of unmaps, it is guaranteed that no
stale
+ *     mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_and_replace    7
+struct gnttab_unmap_and_replace {
+	/* IN parameters. */
+	u64 host_addr;
+	u64 new_addr;
+	grant_handle_t handle;
+	/* OUT parameters. */
+	s16  status;              /* GNTST_* */
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
+
+/*
+ * GNTTABOP_set_version: Request a particular version of the grant
+ * table shared table structure.  This operation can only be
performed
+ * once in any given domain.  It must be performed before any
grants
+ * are activated; otherwise, the domain will be stuck with version
1.
+ * The only defined versions are 1 and 2.
+ */
+#define GNTTABOP_set_version          8
+struct gnttab_set_version {
+	/* IN parameters */
+	u32 version;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_set_version);
+
+/*
+ * GNTTABOP_get_status_frames: Get the list of frames used to
store grant
+ * status for <dom>. In grant format version 2, the status is
separated
+ * from the other shared grant fields to allow more efficient
synchronization
+ * using barriers instead of atomic cmpxchg operations.
+ * <nr_frames> specifies the size of vector <frame_list>.
+ * The frame addresses are returned in the <frame_list>.
+ * Only <nr_frames> addresses are returned, even if the table is
larger.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may specify <dom> !=
DOMID_SELF.
+ */
+#define GNTTABOP_get_status_frames     9
+struct gnttab_get_status_frames {
+	/* IN parameters. */
+	u32 nr_frames;
+	domid_t  dom;
+	/* OUT parameters. */
+	s16  status;              /* GNTST_* */
+
+	GUEST_HANDLE(u64)frame_list;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_status_frames);
+
+/*
+ * GNTTABOP_get_version: Get the grant table version which is in
+ * effect for domain <dom>.
+ */
+#define GNTTABOP_get_version          10
+struct gnttab_get_version {
+	/* IN parameters */
+	domid_t dom;
+	u16 pad;
+	/* OUT parameters */
+	u32 version;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
+
+/*
+ * Issue one or more cache maintenance operations on a portion of
a
+ * page granted to the calling domain by a foreign domain.
+ */
+#define GNTTABOP_cache_flush          12
+struct gnttab_cache_flush {
+	union {
+		u64 dev_bus_addr;
+		grant_ref_t ref;
+	} a;
+	u16 offset;   /* offset from start of grant */
+	u16 length;   /* size within the grant */
+#define GNTTAB_CACHE_CLEAN          (1 << 0)
+#define GNTTAB_CACHE_INVAL          (1 << 1)
+#define GNTTAB_CACHE_SOURCE_GREF    (1 << 31)
+	u32 op;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(gnttab_cache_flush);
+
+/*
+ * Bitfield values for gnttab_map_grant_ref.flags.
+ */
+ /* Map the grant entry for access by I/O devices. */
+#define _GNTMAP_device_map      (0)
+#define GNTMAP_device_map       (1 << _GNTMAP_device_map)
+/* Map the grant entry for access by host CPUs. */
+#define _GNTMAP_host_map        (1)
+#define GNTMAP_host_map         (1 << _GNTMAP_host_map)
+/* Accesses to the granted frame will be restricted to read-only
access. */
+#define _GNTMAP_readonly        (2)
+#define GNTMAP_readonly         (1 << _GNTMAP_readonly)
+/*
+ * GNTMAP_host_map subflag:
+ *  0 => The host mapping is usable only by the guest OS.
+ *  1 => The host mapping is usable by guest OS + current
application.
+ */
+#define _GNTMAP_application_map (3)
+#define GNTMAP_application_map  (1 << _GNTMAP_application_map)
+
+/*
+ * GNTMAP_contains_pte subflag:
+ *  0 => This map request contains a host virtual address.
+ *  1 => This map request contains the machine address of the PTE
to
update.
+ */
+#define _GNTMAP_contains_pte    (4)
+#define GNTMAP_contains_pte     (1 << _GNTMAP_contains_pte)
+
+/*
+ * Bits to be placed in guest kernel available PTE bits
(architecture
+ * dependent; only supported when XENFEAT_gnttab_map_avail_bits is
set).
+ */
+#define _GNTMAP_guest_avail0    (16)
+#define GNTMAP_guest_avail_mask ((u32)~0 << _GNTMAP_guest_avail0)
+
+/*
+ * Values for error status returns. All errors are -ve.
+ */
+#define GNTST_okay             (0)  /* Normal return.
*/
+#define GNTST_general_error    (-1) /* General undefined error.
*/
+#define GNTST_bad_domain       (-2) /* Unrecognised domain id.
*/
+#define GNTST_bad_gntref       (-3) /* Unrecognised or
inappropriate
gntref. */
+#define GNTST_bad_handle       (-4) /* Unrecognised or
inappropriate
handle. */
+#define GNTST_bad_virt_addr    (-5) /* Inappropriate virtual
address to
map. */
+#define GNTST_bad_dev_addr     (-6) /* Inappropriate device
address to
unmap.*/
+#define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.
*/
+#define GNTST_permission_denied (-8) /* Not enough privilege for
operation.
*/
+#define GNTST_bad_page         (-9) /* Specified page was invalid
for op.
*/
+#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page
boundary.   */
+#define GNTST_address_too_big (-11) /* transfer page address too
large.
*/
+#define GNTST_eagain          (-12) /* Operation not done; try
again.
*/
+
+#define GNTTABOP_error_msgs {                   \
+	"okay",                                     \
+	"undefined error",                          \
+	"unrecognised domain id",                   \
+	"invalid grant reference",                  \
+	"invalid mapping handle",                   \
+	"invalid virtual address",                  \
+	"invalid device address",                   \
+	"no spare translation slot in the I/O MMU", \
+	"permission denied",                        \
+	"bad page",                                 \
+	"copy arguments cross page boundary",       \
+	"page address size too large",              \
+	"operation not done; try again"             \
+}
+
+#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
diff --git a/include/xen/interface/hvm/hvm_op.h
b/include/xen/interface/hvm/hvm_op.h
new file mode 100644
index 0000000000..1c53cad729
--- /dev/null
+++ b/include/xen/interface/hvm/hvm_op.h
@@ -0,0 +1,69 @@
+/*
+ * Permission is hereby granted, free of charge, to any person
obtaining a
copy
+ * of this software and associated documentation files (the
"Software"), to
+ * deal in the Software without restriction, including without
limitation the
+ * rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the
Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
+#define __XEN_PUBLIC_HVM_HVM_OP_H__
+
+/* Get/set subcommands: the second argument of the hypercall is a
+ * pointer to a xen_hvm_param struct.
+ */
+#define HVMOP_set_param           0
+#define HVMOP_get_param           1
+struct xen_hvm_param {
+	domid_t  domid;    /* IN */
+	u32 index;    /* IN */
+	u64 value;    /* IN/OUT */
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param);
+
+/* Hint from PV drivers for pagetable destruction. */
+#define HVMOP_pagetable_dying       9
+struct xen_hvm_pagetable_dying {
+	/* Domain with a pagetable about to be destroyed. */
+	domid_t  domid;
+	/* guest physical address of the toplevel pagetable dying */
+	aligned_u64 gpa;
+};
+
+typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t;
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t);
+
+enum hvmmem_type_t {
+	HVMMEM_ram_rw,             /* Normal read/write guest RAM */
+	HVMMEM_ram_ro,             /* Read-only; writes are discarded
*/
+	HVMMEM_mmio_dm,            /* Reads and write go to the device
model */
+};
+
+#define HVMOP_get_mem_type    15
+/* Return hvmmem_type_t for the specified pfn. */
+struct xen_hvm_get_mem_type {
+	/* Domain to be queried. */
+	domid_t domid;
+	/* OUT variable. */
+	u16 mem_type;
+	u16 pad[2]; /* align next field on 8-byte boundary */
+	/* IN variable. */
+	u64 pfn;
+};
+
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_get_mem_type);
+
+#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
diff --git a/include/xen/interface/hvm/params.h
b/include/xen/interface/hvm/params.h
new file mode 100644
index 0000000000..4d61fc58d9
--- /dev/null
+++ b/include/xen/interface/hvm/params.h
@@ -0,0 +1,127 @@
+/*
+ * Permission is hereby granted, free of charge, to any person
obtaining a
copy
+ * of this software and associated documentation files (the
"Software"), to
+ * deal in the Software without restriction, including without
limitation the
+ * rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the
Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
+#define __XEN_PUBLIC_HVM_PARAMS_H__
+
+#include <xen/interface/hvm/hvm_op.h>
+
+/*
+ * Parameter space for HVMOP_{set,get}_param.
+ */
+
+#define HVM_PARAM_CALLBACK_IRQ 0
+/*
+ * How should CPU0 event-channel notifications be delivered?
+ *
+ * If val == 0 then CPU0 event-channel notifications are not
delivered.
+ * If val != 0, val[63:56] encodes the type, as follows:
+ */
+
+#define HVM_PARAM_CALLBACK_TYPE_GSI      0
+/*
+ * val[55:0] is a delivery GSI.  GSI 0 cannot be used, as it
aliases val == 0,
+ * and disables all notifications.
+ */
+
+#define HVM_PARAM_CALLBACK_TYPE_PCI_INTX 1
+/*
+ * val[55:0] is a delivery PCI INTx line:
+ * Domain = val[47:32], Bus = val[31:16] DevFn = val[15:8], IntX =
val[1:0]
+ */
+
+#if defined(__i386__) || defined(__x86_64__)
+#define HVM_PARAM_CALLBACK_TYPE_VECTOR   2
+/*
+ * val[7:0] is a vector number.  Check for
XENFEAT_hvm_callback_vector to
know
+ * if this delivery method is available.
+ */
+#elif defined(__arm__) || defined(__aarch64__)
+#define HVM_PARAM_CALLBACK_TYPE_PPI      2
+/*
+ * val[55:16] needs to be zero.
+ * val[15:8] is interrupt flag of the PPI used by event-channel:
+ *  bit 8: the PPI is edge(1) or level(0) triggered
+ *  bit 9: the PPI is active low(1) or high(0)
+ * val[7:0] is a PPI number used by event-channel.
+ * This is only used by ARM/ARM64 and masking/eoi the interrupt
associated
to
+ * the notification is handled by the interrupt controller.
+ */
+#endif
+
+#define HVM_PARAM_STORE_PFN    1
+#define HVM_PARAM_STORE_EVTCHN 2
+
+#define HVM_PARAM_PAE_ENABLED  4
+
+#define HVM_PARAM_IOREQ_PFN    5
+
+#define HVM_PARAM_BUFIOREQ_PFN 6
+
+/*
+ * Set mode for virtual timers (currently x86 only):
+ *  delay_for_missed_ticks (default):
+ *   Do not advance a vcpu's time beyond the correct delivery time
for
+ *   interrupts that have been missed due to preemption. Deliver
missed
+ *   interrupts when the vcpu is rescheduled and advance the
vcpu's virtual
+ *   time stepwise for each one.
+ *  no_delay_for_missed_ticks:
+ *   As above, missed interrupts are delivered, but guest time
always tracks
+ *   wallclock (i.e., real) time while doing so.
+ *  no_missed_ticks_pending:
+ *   No missed interrupts are held pending. Instead, to ensure
ticks are
+ *   delivered at some non-zero rate, if we detect missed ticks
then the
+ *   internal tick alarm is not disabled if the VCPU is preempted
during the
+ *   next tick period.
+ *  one_missed_tick_pending:
+ *   Missed interrupts are collapsed together and delivered as one
'late
tick'.
+ *   Guest time always tracks wallclock (i.e., real) time.
+ */
+#define HVM_PARAM_TIMER_MODE   10
+#define HVMPTM_delay_for_missed_ticks    0
+#define HVMPTM_no_delay_for_missed_ticks 1
+#define HVMPTM_no_missed_ticks_pending   2
+#define HVMPTM_one_missed_tick_pending   3
+
+/* Boolean: Enable virtual HPET (high-precision event timer)?
(x86-only) */
+#define HVM_PARAM_HPET_ENABLED 11
+
+/* Identity-map page directory used by Intel EPT when CR0.PG=0. */
+#define HVM_PARAM_IDENT_PT     12
+
+/* Device Model domain, defaults to 0. */
+#define HVM_PARAM_DM_DOMAIN    13
+
+/* ACPI S state: currently support S0 and S3 on x86. */
+#define HVM_PARAM_ACPI_S_STATE 14
+
+/* TSS used on Intel when CR0.PE=0. */
+#define HVM_PARAM_VM86_TSS     15
+
+/* Boolean: Enable aligning all periodic vpts to reduce interrupts
*/
+#define HVM_PARAM_VPT_ALIGN    16
+
+/* Console debug shared memory ring and event channel */
+#define HVM_PARAM_CONSOLE_PFN    17
+#define HVM_PARAM_CONSOLE_EVTCHN 18
+
+#define HVM_NR_PARAMS          19
+
+#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
diff --git a/include/xen/interface/io/blkif.h
b/include/xen/interface/io/blkif.h
new file mode 100644
index 0000000000..7d74c99226
--- /dev/null
+++ b/include/xen/interface/io/blkif.h
@@ -0,0 +1,726 @@
+/************************************************************
******************
+ * blkif.h
+ *
+ * Unified block-device I/O interface for Xen guest OSes.
+ *
+ * Permission is hereby granted, free of charge, to any person
obtaining a
copy
+ * of this software and associated documentation files (the
"Software"), to
+ * deal in the Software without restriction, including without
limitation the
+ * rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the
Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ * Copyright (c) 2012, Spectra Logic Corporation
+ */
+
+#ifndef __XEN_PUBLIC_IO_BLKIF_H__
+#define __XEN_PUBLIC_IO_BLKIF_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+/*
+ * Front->back notifications: When enqueuing a new request,
sending a
+ * notification can be made conditional on req_event (i.e., the
generic
+ * hold-off mechanism provided by the ring macros). Backends must
set
+ * req_event appropriately (e.g., using
RING_FINAL_CHECK_FOR_REQUESTS()).
+ *
+ * Back->front notifications: When enqueuing a new response,
sending a
+ * notification can be made conditional on rsp_event (i.e., the
generic
+ * hold-off mechanism provided by the ring macros). Frontends must
set
+ * rsp_event appropriately (e.g., using
RING_FINAL_CHECK_FOR_RESPONSES()).
+ */
+
+#ifndef blkif_vdev_t
+#define blkif_vdev_t   u16
+#endif
+#define blkif_sector_t u64
+
+/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen block driver utilize nodes within the
XenStore to
+ * communicate capabilities and to negotiate operating
parameters.  This
+ * section enumerates these nodes which reside in the respective
front and
+ * backend portions of the XenStore, following the XenBus
convention.
+ *
+ * All data in the XenStore is stored as strings.  Nodes
specifying numeric
+ * values are encoded in decimal.  Integer value ranges listed
below are
+ * expressed as fixed sized integer types capable of storing the
conversion
+ * of a properly formatted node string, without loss of
information.
+ *
+ * Any specified default value is in effect if the corresponding
XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use
by the
+ * driver side whose XenBus tree contains them.
+ *
+ * XenStore nodes marked "DEPRECATED" in their notes section
should only
be
+ * used to provide interoperability with legacy implementations.
+ *
+ * See the XenBus state transition diagram below for details on
when XenBus
+ * nodes must be published and when they can be queried.
+ *
+
**************************************************************
***************
+ *                            Backend XenBus Nodes
+
**************************************************************
***************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ---
---------------
+ *
+ * mode
+ *      Values:         "r" (read only), "w" (writable)
+ *
+ *      The read or write access permissions to the backing store
to be
+ *      granted to the frontend.
+ *
+ * params
+ *      Values:         string
+ *
+ *      A free formatted string providing sufficient information
for the
+ *      hotplug script to attach the device and provide a suitable
+ *      handler (ie: a block device) for blkback to use.
+ *
+ * physical-device
+ *      Values:         "MAJOR:MINOR"
+ *      Notes: 11
+ *
+ *      MAJOR and MINOR are the major number and minor number of
the
+ *      backing device respectively.
+ *
+ * physical-device-path
+ *      Values:         path string
+ *
+ *      A string that contains the absolute path to the disk
image. On
+ *      NetBSD and Linux this is always a block device, while on
FreeBSD
+ *      it can be either a block device or a regular file.
+ *
+ * type
+ *      Values:         "file", "phy", "tap"
+ *
+ *      The type of the backing device/object.
+ *
+ *
+ * direct-io-safe
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      The underlying storage is not affected by the direct IO
memory
+ *      lifetime bug.  See:
+ *
http://lists.xen.org/archives/html/xen-devel/2012-12/msg01154.html






 Therefore this option gives the backend permission to use





 O_DIRECT, notwithstanding that bug.









 That is, if this option is enabled, use of O_DIRECT is





safe,


 in circumstances where we would normally have avoided it





as a


 workaround for that bug.  This option is not relevant for





all


 backends, and even not necessarily supported for those for





 which it is relevant.  A backend which knows that it is





not


 affected by the bug can ignore this option.









 This option doesn't require a backend to use O_DIRECT, so





it


 should not be used to try to control the caching





behaviour.





*--------------------------------- Features -------------------








feature-barrier



 Values:         0/1 (boolean)





 Default Value:  0









 A value of "1" indicates that the backend can process





requests


 containing the BLKIF_OP_WRITE_BARRIER request opcode.





Requests


 of this type may still be returned at any time with the





 BLKIF_RSP_EOPNOTSUPP result code.









feature-flush-cache



 Values:         0/1 (boolean)





 Default Value:  0









 A value of "1" indicates that the backend can process





requests


 containing the BLKIF_OP_FLUSH_DISKCACHE request opcode.





Requests


 of this type may still be returned at any time with the





 BLKIF_RSP_EOPNOTSUPP result code.









feature-discard



 Values:         0/1 (boolean)





 Default Value:  0









 A value of "1" indicates that the backend can process





requests


 containing the BLKIF_OP_DISCARD request opcode.  Requests





 of this type may still be returned at any time with the





 BLKIF_RSP_EOPNOTSUPP result code.









feature-persistent



 Values:         0/1 (boolean)





 Default Value:  0





 Notes: 7









 A value of "1" indicates that the backend can keep the





grants used


 by the frontend driver mapped, so the same set of grants





should be


 used in all transactions. The maximum number of grants the





backend


 can map persistently depends on the implementation, but





ideally it


 should be RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST.





Using this


 feature the backend doesn't need to unmap each grant,





preventing


 costly TLB flushes. The backend driver should only map





grants


 persistently if the frontend supports it. If a backend





driver chooses


 to use the persistent protocol when the frontend doesn't





support it,


 it will probably hit the maximum number of persistently





mapped
grants


 (due to the fact that the frontend won't be reusing the





same
grants),


 and fall back to non-persistent mode. Backend





implementations
may


 shrink or expand the number of persistently mapped grants





without


 notifying the frontend depending on memory constraints





(this might


 cause a performance degradation).









 If a backend driver wants to limit the maximum number of





persistently


 mapped grants to a value less than RING_SIZE *





 BLKIF_MAX_SEGMENTS_PER_REQUEST a LRU strategy should be





used to


 discard the grants that are less commonly used. Using a





LRU in the


 backend driver paired with a LIFO queue in the frontend





will


 allow us to have better performance in this scenario.








*----------------------- Request Transport Parameters ---------








max-ring-page-order



 Values:         <uint32_t>





 Default Value:  0





 Notes:          1, 3









 The maximum supported size of the request ring buffer in





units of


 lb(machine pages). (e.g. 0 == 1 page,  1 = 2 pages, 2 == 4





pages,


 etc.).









max-ring-pages



 Values:         <uint32_t>





 Default Value:  1





 Notes:          DEPRECATED, 2, 3









 The maximum supported size of the request ring buffer in





units of


 machine pages.  The value must be a power of 2.








*------------------------- Backend Device Properties ------------








discard-enable



 Values:         0/1 (boolean)





 Default Value:  1









 This optional property, set by the toolstack, instructs





the backend


 to offer (or not to offer) discard to the frontend. If the





property


 is missing the backend should offer discard if the backing





storage


 actually supports it.









discard-alignment



 Values:         <uint32_t>





 Default Value:  0





 Notes:          4, 5









 The offset, in bytes from the beginning of the virtual





block device,


 to the first, addressable, discard extent on the





underlying device.






discard-granularity



 Values:         <uint32_t>





 Default Value:  <"sector-size">





 Notes:          4









 The size, in bytes, of the individually addressable





discard extents


 of the underlying device.









discard-secure



 Values:         0/1 (boolean)





 Default Value:  0





 Notes:          10









 A value of "1" indicates that the backend can process





BLKIF_OP_DISCARD


 requests with the BLKIF_DISCARD_SECURE flag set.









info



 Values:         <uint32_t> (bitmap)









 A collection of bit flags describing attributes of the





backing


 device.  The VDISK_* macros define the meaning of each bit





 location.









sector-size



 Values:         <uint32_t>









 The logical block size, in bytes, of the underlying





storage. This


 must be a power of two with a minimum value of 512.









 NOTE: Because of implementation bugs in some frontends





this
must be


       set to 512, unless the frontend advertizes a non-





zero value


       in its "feature-large-sector-size" xenbus node. (See





below).






physical-sector-size



 Values:         <uint32_t>





 Default Value:  <"sector-size">









 The physical block size, in bytes, of the backend storage.





This


 must be an integer multiple of "sector-size".









sectors



 Values:         <u64>









 The size of the backend device, expressed in units of





"sector-size".


 The product of "sector-size" and "sectors" must also be an





integer


 multiple of "physical-sector-size", if that node is





present.











                       Frontend XenBus Nodes













*----------------------- Request Transport Parameters ---------








event-channel



 Values:         <uint32_t>









 The identifier of the Xen event channel used to signal





activity


 in the ring buffer.









ring-ref



 Values:         <uint32_t>





 Notes:          6









 The Xen grant reference granting permission for the





backend to
map


 the sole page in a single page sized ring buffer.









ring-ref%u



 Values:         <uint32_t>





 Notes:          6









 For a frontend providing a multi-page ring, a "number of





ring pages"


 sized list of nodes, each containing a Xen grant reference





granting


 permission for the backend to map the page of the ring





located


 at page index "%u".  Page indexes are zero based.









protocol



 Values:         string (XEN_IO_PROTO_ABI_*)





 Default Value:  XEN_IO_PROTO_ABI_NATIVE









 The machine ABI rules governing the format of all ring





request and


 response structures.









ring-page-order



 Values:         <uint32_t>





 Default Value:  0





 Maximum Value:  MAX(ffs(max-ring-pages) - 1,





max-ring-page-order)


 Notes:          1, 3









 The size of the frontend allocated request ring buffer in





units


 of lb(machine pages). (e.g. 0 == 1 page, 1 = 2 pages, 2 ==





4 pages,


 etc.).









num-ring-pages



 Values:         <uint32_t>





 Default Value:  1





 Maximum Value:  MAX(max-ring-pages,(0x1 <<





max-ring-page-order))


 Notes:          DEPRECATED, 2, 3









 The size of the frontend allocated request ring buffer in





units of


 machine pages.  The value must be a power of 2.








*--------------------------------- Features -------------------








feature-persistent



 Values:         0/1 (boolean)





 Default Value:  0





 Notes: 7, 8, 9









 A value of "1" indicates that the frontend will reuse the





same grants


 for all transactions, allowing the backend to map them





with write


 access (even when it should be read-only). If the frontend





hits the


 maximum number of allowed persistently mapped grants, it





can
fallback


 to non persistent mode. This will cause a performance





degradation,


 since the backend driver will still try to map those





grants


 persistently. Since the persistent grants protocol is





compatible with


 the previous protocol, a frontend driver can choose to





work in


 persistent mode even when the backend doesn't support it.









 It is recommended that the frontend driver stores the





persistently


 mapped grants in a LIFO queue, so a subset of all





persistently
mapped


 grants gets used commonly. This is done in case the





backend driver


 decides to limit the maximum number of persistently mapped





grants


 to a value less than RING_SIZE *





BLKIF_MAX_SEGMENTS_PER_REQUEST.






feature-large-sector-size



 Values:         0/1 (boolean)





 Default Value:  0









 A value of "1" indicates that the frontend will correctly





supply and


 interpret all sector-based quantities in terms of the





"sector-size"


 value supplied in the backend info, whatever that may be





set to.


 If this node is not present or its value is "0" then it is





assumed


 that the frontend requires that the logical block size is





512 as it


 is hardcoded (which is the case in some frontend





implementations).





*------------------------- Virtual Device Properties ------------








device-type



 Values:         "disk", "cdrom", "floppy", etc.









virtual-device



 Values:         <uint32_t>









 A value indicating the physical device to virtualize





within the


 frontend's domain.  (e.g. "The first ATA disk", "The third





SCSI


 disk", etc.)









 See docs/misc/vbd-interface.txt for details on the format





of this


 value.









Notes








(1) Multi-page ring buffer scheme first developed in the Citrix



XenServer


PV drivers.





(2) Multi-page ring buffer scheme first used in some RedHat



distributions


including a distribution deployed on certain nodes of the





Amazon


EC2 cluster.





(3) Support for multi-page ring buffers was implemented



independently,


in slightly different forms, by both Citrix and





RedHat/Amazon.


For full interoperability, block front and backends should





publish


identical ring parameters, adjusted for unit differences,





to the


XenStore nodes used in both schemes.





(4) Devices that support discard functionality may internally



allocate space


(discardable extents) in units that are larger than the





exported
logical


block size. If the backing device has such discardable





extents the


backend should provide both discard-granularity and





discard-alignment.


Providing just one of the two may be considered an error by





the
frontend.


Backends supporting discard should include discard-





granularity and


discard-alignment even if it supports discarding individual





sectors.


Frontends should assume discard-alignment == 0 and





discard-granularity


== sector size if these keys are missing.





(5) The discard-alignment parameter allows a physical device to



be


partitioned into virtual devices that do not necessarily





begin or


end on a discardable extent boundary.





(6) When there is only a single page allocated to the request



ring,


'ring-ref' is used to communicate the grant reference for





this


page to the backend.  When using a multi-page ring, the





'ring-ref'


node is not created.  Instead 'ring-ref0' - 'ring-refN' are





used.


(7) When using persistent grants data has to be copied from/to



the page


where the grant is currently mapped. The overhead of doing





this
copy


however doesn't suppress the speed improvement of not





having to
unmap


the grants.





(8) The frontend driver has to allow the backend driver to map



all grants


with write access, even when they should be mapped read-





only,
since


further requests may reuse these grants and require write





permissions.


(9) Linux implementation doesn't have a limit on the maximum



number of


grants that can be persistently mapped in the frontend





driver, but


due to the frontend driver implementation it should never





be bigger


than RING_SIZE * BLKIF_MAX_SEGMENTS_PER_REQUEST.




*(10) The discard-secure property may be present and will be set

to 1 if the


backing device supports secure discard.




*(11) Only used by Linux and NetBSD.
*/


+/*


Multiple hardware queues/rings:



If supported, the backend will write the key "multi-queue-max-



queues" to


the directory for that vbd, and set its value to the maximum



supported


number of queues.



Frontends that are aware of this feature and wish to use it can



write the


key "multi-queue-num-queues" with the number they wish to use,



which
must be


greater than zero, and no more than the value reported by the



backend in


"multi-queue-max-queues".







For frontends requesting just one queue, the usual event-



channel and


ring-ref keys are written as before, simplifying the backend



processing


to avoid distinguishing between a frontend that doesn't



understand the


multi-queue feature, and one that does, but requested only one



queue.






Frontends requesting two or more queues must not write the



toplevel


event-channel and ring-ref keys, instead writing those keys



under
sub-keys


having the name "queue-N" where N is the integer ID of the



queue/ring
for


which those keys belong. Queues are indexed from zero.



For example, a frontend with two queues must write the



following set of


queue-related keys:







/local/domain/1/device/vbd/0/multi-queue-num-queues = "2"



/local/domain/1/device/vbd/0/queue-0 = ""



/local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"



/local/domain/1/device/vbd/0/queue-0/event-channel =



"<evtchn#0>"


/local/domain/1/device/vbd/0/queue-1 = ""



/local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"



/local/domain/1/device/vbd/0/queue-1/event-channel =



"<evtchn#1>"






It is also possible to use multiple queues/rings together with



feature multi-page ring buffer.



For example, a frontend requests two queues/rings and the size



of each
ring


buffer is two pages must write the following set of related



keys:






/local/domain/1/device/vbd/0/multi-queue-num-queues = "2"



/local/domain/1/device/vbd/0/ring-page-order = "1"



/local/domain/1/device/vbd/0/queue-0 = ""



/local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"



/local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"



/local/domain/1/device/vbd/0/queue-0/event-channel =



"<evtchn#0>"


/local/domain/1/device/vbd/0/queue-1 = ""



/local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"



/local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"



/local/domain/1/device/vbd/0/queue-1/event-channel =



"<evtchn#1>"





*/


+/*


STATE DIAGRAMS












                              Startup



















Tool stack creates front and back nodes with state



XenbusStateInitialising.






Front                                Back



=================================



=====================================


XenbusStateInitialising              XenbusStateInitialising



o Query virtual device               o Query backend device



identification


properties.                          data.



o Setup OS device instance.          o Open and validate



backend
device.


                                  o Publish backend





features and


                                    transport parameters.





                                                 |





                                                 |





                                                 V





                                 XenbusStateInitWait









o Query backend features and



transport parameters.



o Allocate and initialize the



request ring.



o Publish transport parameters



that will be in effect during



this connection.



         |





         |





         V





XenbusStateInitialised







                                  o Query frontend





transport parameters.


                                  o Connect to the request





ring and


                                    event channel.





                                  o Publish backend device





properties.


                                                 |





                                                 |





                                                 V





                                 XenbusStateConnected









o Query backend device properties.



o Finalize OS virtual device



instance.



         |





         |





         V





XenbusStateConnected







Note: Drivers that do not support any optional features, or the



negotiation


  of transport parameters, can skip certain states in the





state
machine:






  o A frontend may transition to XenbusStateInitialised





without


    waiting for the backend to enter





XenbusStateInitWait.  In this


    case, default transport parameters are in effect and





any


    transport parameters published by the frontend must





contain


    their default values.









  o A backend may transition to XenbusStateInitialised,





bypassing


    XenbusStateInitWait, without waiting for the frontend





to first


    enter the XenbusStateInitialised state.  In this case,





default


    transport parameters are in effect and any transport





parameters


    published by the backend must contain their default





values.






  Drivers that support optional features and/or transport





parameter


  negotiation must tolerate these additional state





transition paths.


  In general this means performing the work of any skipped





state


  transition, if it has not already been performed, in





addition to the


  work associated with entry into the current state.




*/


+/*


REQUEST CODES.


*/

+#define BLKIF_OP_READ              0
+#define BLKIF_OP_WRITE             1
+/*


All writes issued prior to a request with the



BLKIF_OP_WRITE_BARRIER


operation code ("barrier request") must be completed prior to



the


execution of the barrier request.  All writes issued after the



barrier


request must not execute until after the completion of the



barrier request.






Optional.  See "feature-barrier" XenBus node documentation



above.

*/

+#define BLKIF_OP_WRITE_BARRIER     2
+/*


Commit any uncommitted contents of the backing device's



volatile cache


to stable storage.







Optional.  See "feature-flush-cache" XenBus node documentation



above.

*/

+#define BLKIF_OP_FLUSH_DISKCACHE   3
+/*


Used in SLES sources for device specific command packet



contained within the request. Reserved for that purpose.


*/

+#define BLKIF_OP_RESERVED_1        4
+/*


Indicate to the backend device that a region of storage is no



longer in


use, and may be discarded at any time without impact to the



client.  If


the BLKIF_DISCARD_SECURE flag is set on the request, all copies



of the


discarded region on the device must be rendered unrecoverable



before
the


command returns.







This operation is analogous to performing a trim (ATA) or unmap



(SCSI),


command on a native device.







More information about trim/unmap operations can be found at:







http://t13.org/Documents/UploadedDocuments/docs2008/


e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc









http://www.seagate.com/staticfiles/support/disc/manuals/


Interface%20manuals/100293068c.pdf









Optional.  See "feature-discard", "discard-alignment",



"discard-granularity", and "discard-secure" in the XenBus node



documentation above.


*/

+#define BLKIF_OP_DISCARD           5



+/*


Recognized if "feature-max-indirect-segments" is present in the



backend


xenbus info. The "feature-max-indirect-segments" node contains



the
maximum


number of segments allowed by the backend per request. If the



node is


present, the frontend might use blkif_request_indirect structs



in order to


issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST



(11). The


maximum number of indirect segments is fixed by the backend,



but the


frontend can issue requests with any number of indirect



segments as long
as


it's less than the number provided by the backend. The



indirect_grefs field


in blkif_request_indirect should be filled by the frontend with



the


grant references of the pages that are holding the indirect



segments.


These pages are filled with an array of blkif_request_segment



that hold
the


information about the segments. The number of indirect pages to



use is


determined by the number of segments an indirect request



contains.
Every


indirect page can contain a maximum of



(PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so



to


calculate the number of indirect pages to use we have to do



ceil(indirect_segments / (PAGE_SIZE / sizeof(struct



blkif_request_segment))).






If a backend does not recognize BLKIF_OP_INDIRECT, it should



*not*


create the "feature-max-indirect-segments" node!


*/

+#define BLKIF_OP_INDIRECT          6



+/*


Maximum scatter/gather segments per request.



This is carefully chosen so that sizeof(blkif_ring_t) <=



PAGE_SIZE.


NB. This could be 12 if the ring indexes weren't stored in the



same page.

*/

+#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11



+/*


Maximum number of indirect pages to use per request.


*/

+#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8



+/*


NB. 'first_sect' and 'last_sect' in blkif_request_segment, as



well as


'sector_number' in blkif_request, blkif_request_discard and



blkif_request_indirect are sector-based quantities. See the



description


of the "feature-large-sector-size" frontend xenbus node above



for


more information.


*/

+struct blkif_request_segment {

grant_ref_t gref;        /* reference to I/O buffer

frame        */

/* @first_sect: first sector in frame to transfer

(inclusive).   */

/* @last_sect: last sector in frame to transfer

(inclusive).     */

u8     first_sect, last_sect;

+};



+/*


Starting ring element for any I/O request.


*/

+struct blkif_request {

u8        operation;    /* BLKIF_OP_???

*/

u8        nr_segments;  /* number of segments

*/

blkif_vdev_t   handle;       /* only for read/write requests

*/

u64       id;           /* private guest value, echoed in

resp  */

blkif_sector_t sector_number;/* start sector idx on disk (r/w

only)  */

struct blkif_request_segment

seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};



+typedef struct blkif_request blkif_request_t;



+/*


Cast to this structure when blkif_request.operation ==



BLKIF_OP_DISCARD


sizeof(struct blkif_request_discard) <= sizeof(struct



blkif_request)

*/

+struct blkif_request_discard {

u8        operation;    /* BLKIF_OP_DISCARD

*/

u8        flag;         /* BLKIF_DISCARD_SECURE or zero

*/
+#define BLKIF_DISCARD_SECURE (1 << 0)  /* ignored if discard-
secure=0
*/

blkif_vdev_t   handle;       /* same as for read/write requests

*/

u64       id;           /* private guest value, echoed in

resp  */

blkif_sector_t sector_number;/* start sector idx on disk

*/

u64       nr_sectors;   /* number of contiguous sectors to

discard*/
+};



+typedef struct blkif_request_discard blkif_request_discard_t;



+struct blkif_request_indirect {

u8        operation;    /* BLKIF_OP_INDIRECT

*/

u8        indirect_op;  /* BLKIF_OP_{READ/WRITE}

*/

u16       nr_segments;  /* number of segments

*/

u64       id;           /* private guest value, echoed in

resp  */

blkif_sector_t sector_number;/* start sector idx on disk (r/w

only)  */

blkif_vdev_t   handle;       /* same as for read/write requests

*/

grant_ref_t

indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+#ifdef __i386__

u64       pad;          /* Make it 64 byte aligned on i386

*/
+#endif
+};



+typedef struct blkif_request_indirect blkif_request_indirect_t;



+struct blkif_response {

u64        id;              /* copied from request */
u8         operation;       /* copied from request */
s16         status;          /* BLKIF_RSP_???       */

+};



+typedef struct blkif_response blkif_response_t;



+/*


STATUS RETURN CODES.


*/
/* Operation not supported (only happens on barrier writes). */

+#define BLKIF_RSP_EOPNOTSUPP  -2

/* Operation failed for some unspecified reason (-EIO). */

+#define BLKIF_RSP_ERROR       -1

/* Operation completed successfully. */

+#define BLKIF_RSP_OKAY         0



+/*


Generate blkif ring structures and types.


*/

+DEFINE_RING_TYPES(blkif, struct blkif_request, struct
blkif_response);



+#define VDISK_CDROM        0x1
+#define VDISK_REMOVABLE    0x2
+#define VDISK_READONLY     0x4



+#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */



+/*


Local variables:



mode: C



c-file-style: "BSD"



c-basic-offset: 4



tab-width: 4



indent-tabs-mode: nil



End:


*/

diff --git a/include/xen/interface/io/console.h
b/include/xen/interface/io/console.h
new file mode 100644
index 0000000000..3489fc7a60
--- /dev/null
+++ b/include/xen/interface/io/console.h
@@ -0,0 +1,56 @@
+/************************************************************



console.h







Console I/O interface for Xen guest OSes.







Permission is hereby granted, free of charge, to any person



obtaining a
copy


of this software and associated documentation files (the



"Software"), to


deal in the Software without restriction, including without



limitation the


rights to use, copy, modify, merge, publish, distribute,



sublicense, and/or


sell copies of the Software, and to permit persons to whom the



Software is


furnished to do so, subject to the following conditions:







The above copyright notice and this permission notice shall be



included in


all copies or substantial portions of the Software.







THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY



KIND, EXPRESS OR


IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF



MERCHANTABILITY,


FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO



EVENT SHALL THE


AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,



DAMAGES OR OTHER


LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,



ARISING


FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE



OR OTHER


DEALINGS IN THE SOFTWARE.







Copyright (c) 2005, Keir Fraser


*/


+#ifndef __XEN_PUBLIC_IO_CONSOLE_H__
+#define __XEN_PUBLIC_IO_CONSOLE_H__



+typedef u32 XENCONS_RING_IDX;



+#define MASK_XENCONS_IDX(idx, ring) ((idx) & (sizeof(ring) - 1))



+struct xencons_interface {

/*
 * Layout of the console ring: two byte buffers, each with its own
 * consumer/producer index pair. Both buffer sizes are powers of two, so
 * masking with sizeof(ring) - 1, as MASK_XENCONS_IDX() does, wraps an index
 * into the buffer.
 * NOTE(review): which side produces into in[] and which into out[] is not
 * visible here - presumably named from the guest's point of view; confirm.
 */
char in[1024];
char out[2048];
/* Consumer and producer indices for in[]. */
XENCONS_RING_IDX in_cons, in_prod;
/* Consumer and producer indices for out[]. */
XENCONS_RING_IDX out_cons, out_prod;

+};



+#ifdef XEN_WANT_FLEX_CONSOLE_RING
+#include "ring.h"
+DEFINE_XEN_FLEX_RING(xencons);
+#endif



+#endif /* __XEN_PUBLIC_IO_CONSOLE_H__ */



+/*


Local variables:



mode: C



c-file-style: "BSD"



c-basic-offset: 4



tab-width: 4



indent-tabs-mode: nil



End:


*/

diff --git a/include/xen/interface/io/protocols.h
b/include/xen/interface/io/protocols.h
new file mode 100644
index 0000000000..52b4de0f81
--- /dev/null
+++ b/include/xen/interface/io/protocols.h
@@ -0,0 +1,42 @@
+/************************************************************



protocols.h







Permission is hereby granted, free of charge, to any person



obtaining a
copy


of this software and associated documentation files (the



"Software"), to


deal in the Software without restriction, including without



limitation the


rights to use, copy, modify, merge, publish, distribute,



sublicense, and/or


sell copies of the Software, and to permit persons to whom the



Software is


furnished to do so, subject to the following conditions:







The above copyright notice and this permission notice shall be



included in


all copies or substantial portions of the Software.







THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY



KIND, EXPRESS OR


IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF



MERCHANTABILITY,


FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO



EVENT SHALL THE


AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,



DAMAGES OR OTHER


LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,



ARISING


FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE



OR OTHER


DEALINGS IN THE SOFTWARE.







Copyright (c) 2008, Keir Fraser


*/


+#ifndef __XEN_PROTOCOLS_H__
+#define __XEN_PROTOCOLS_H__



+#define XEN_IO_PROTO_ABI_X86_32     "x86_32-abi"
+#define XEN_IO_PROTO_ABI_X86_64     "x86_64-abi"
+#define XEN_IO_PROTO_ABI_ARM        "arm-abi"



+#if defined(__i386__)
+# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32
+#elif defined(__x86_64__)
+# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_64
+#elif defined(__arm__) || defined(__aarch64__)
+# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_ARM
+#else
+# error arch fixup needed here
+#endif



+#endif
diff --git a/include/xen/interface/io/ring.h
b/include/xen/interface/io/ring.h
new file mode 100644
index 0000000000..4e02678e3c
--- /dev/null
+++ b/include/xen/interface/io/ring.h
@@ -0,0 +1,479 @@
+/************************************************************



ring.h







Shared producer-consumer ring macros.







Permission is hereby granted, free of charge, to any person



obtaining a
copy


of this software and associated documentation files (the



"Software"), to


deal in the Software without restriction, including without



limitation the


rights to use, copy, modify, merge, publish, distribute,



sublicense, and/or


sell copies of the Software, and to permit persons to whom the



Software is


furnished to do so, subject to the following conditions:







The above copyright notice and this permission notice shall be



included in


all copies or substantial portions of the Software.







THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY



KIND, EXPRESS OR


IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF



MERCHANTABILITY,


FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO



EVENT SHALL THE


AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,



DAMAGES OR OTHER


LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,



ARISING


FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE



OR OTHER


DEALINGS IN THE SOFTWARE.







Tim Deegan and Andrew Warfield November 2004.


*/


+#ifndef __XEN_PUBLIC_IO_RING_H__
+#define __XEN_PUBLIC_IO_RING_H__



+/*


When #include'ing this header, you need to provide the



following


declaration upfront:




standard integer types (u8, u16, etc)





They are provided by stdint.h of the standard headers.







In addition, if you intend to use the FLEX macros, you also



need to


provide the following, before invoking the FLEX macros:




size_t






memcpy






grant_ref_t





These declarations are provided by string.h of the standard



headers,


and grant_table.h from the Xen public headers.


*/


+#include <xen/interface/grant_table.h>



+typedef unsigned int RING_IDX;



+/* Round a 32-bit unsigned constant down to the nearest power of
two. */
+#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                  :
((_x)
& 0x1))
+#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    :
__RD2(_x))
+#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    :
__RD4(_x))
+#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    :
__RD8(_x))
+#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 :
__RD16(_x))



+/*


Calculate size of a shared ring, given the total available



space for the


ring and indexes (_sz), and the name tag of the



request/response
structure.


A ring contains as many entries as will fit, rounded down to



the nearest


power of two (so we can mask with (size-1) to loop around).


*/

+#define __CONST_RING_SIZE(_s, _sz) \

(__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \
sizeof(((struct _s##_sring *)0)->ring[0])))



+/*
+ * The same for passing in an actual pointer instead of a name tag.
+ *
+ * _s is a pointer to the shared ring and _sz the size in bytes of the whole
+ * shared area. The byte offset of ring[] from the start of *_s is taken off
+ * _sz, the remainder is divided by the size of one ring entry, and __RD32()
+ * rounds the quotient down to a power of two.
+ */
+#define __RING_SIZE(_s, _sz) \
+	(__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))



+/*


Macros to make the correct C datatypes for a new kind of ring.







To make a new ring datatype, you need to have two message



structures,


let's say request_t, and response_t already defined.







In a header where you want the ring datatype declared, you then



do:






DEFINE_RING_TYPES(mytag, request_t, response_t);









These expand out to give you a set of types, as you can see



below.


The most important of these are:







mytag_sring_t      - The shared ring.





mytag_front_ring_t - The 'front' half of the ring.





mytag_back_ring_t  - The 'back' half of the ring.









To initialize a ring in your code you need to know the location



and size


of the shared memory area (PAGE_SIZE, for instance). To



initialise


the front half:







mytag_front_ring_t front_ring;





SHARED_RING_INIT((mytag_sring_t *)shared_page);





FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page,





PAGE_SIZE);






Initializing the back follows similarly (note that only the



front


initializes the shared ring):







mytag_back_ring_t back_ring;





BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page,





PAGE_SIZE);

*/


+#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)
\

							

\

+/* Shared ring entry */
\
+union __name##_sring_entry
{                                                      \

__req_t req;

\

__rsp_t rsp;

\
+};
\

							

\

+/* Shared ring page */
\
+struct __name##_sring
{                                                           \

RING_IDX req_prod, req_event;

\

RING_IDX rsp_prod, rsp_event;

\

union

{
      \

struct



{                                                          \

	u8 smartpoll_active;



\

} netif;



\

struct



{                                                          \

	u8 msg;



\

} tapif_user;



\

u8 pvt_pad[4];



\

} pvt;

\

u8 __pad[44];

\

union __name##_sring_entry ring[1]; /* variable-length */

\
+};
\

							

\

+/* "Front" end's private variables */
\
+struct __name##_front_ring
{                                                      \

RING_IDX req_prod_pvt;

\

RING_IDX rsp_cons;

\

unsigned int nr_ents;

\

struct __name##_sring *sring;

\
+};
\

							

\

+/* "Back" end's private variables */
\
+struct __name##_back_ring
{                                                       \

RING_IDX rsp_prod_pvt;

\

RING_IDX req_cons;

\

unsigned int nr_ents;

\

struct __name##_sring *sring;

\
+};
\

							

\

+/* Syntactic sugar */
\
+typedef struct __name##_sring __name##_sring_t;
\
+typedef struct __name##_front_ring __name##_front_ring_t;
\
+typedef struct __name##_back_ring __name##_back_ring_t



+/*


Macros for manipulating rings.







FRONT_RING_whatever works on the "front end" of a ring: here



requests are pushed on to the ring and responses taken off it.







BACK_RING_whatever works on the "back end" of a ring: here



requests are taken off the ring and responses put on.







N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.



This is OK in 1-for-1 request-response situations where the



requestor (front end) never has more than RING_SIZE()-1



outstanding requests.


*/


+/* Initialising empty rings */
+#define SHARED_RING_INIT(_s) do
{                                                 \

(_s)->req_prod  = (_s)->rsp_prod  = 0;

\

(_s)->req_event = (_s)->rsp_event = 1;

\

(void)memset((_s)->pvt.pvt_pad, 0, sizeof((_s)->pvt.pvt_pad));

\

(void)memset((_s)->__pad, 0, sizeof((_s)->__pad));

\
+} while (0)



+#define FRONT_RING_INIT(_r, _s, __size) do
{                                      \

(_r)->req_prod_pvt = 0;

\

(_r)->rsp_cons = 0;

\

(_r)->nr_ents = __RING_SIZE(_s, __size);

\

(_r)->sring = (_s);

\
+} while (0)



+#define BACK_RING_INIT(_r, _s, __size) do
{                                       \

(_r)->rsp_prod_pvt = 0;

\

(_r)->req_cons = 0;

\

(_r)->nr_ents = __RING_SIZE(_s, __size);

\

(_r)->sring = (_s);

\
+} while (0)



+/* How big is this ring? */
+#define RING_SIZE(_r)
\

((_r)->nr_ents)


+/* Number of free requests (for use on front side only). */
+#define RING_FREE_REQUESTS(_r)
\

(RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))


+/* Test if the front ring is full, i.e. no empty slot is available on it.


(This is only meaningful from the front. )


*/

+#define RING_FULL(_r)
\

(RING_FREE_REQUESTS(_r) == 0)


+/* Test if there are outstanding messages to be processed on a
ring. */
+#define RING_HAS_UNCONSUMED_RESPONSES(_r)
\

((_r)->sring->rsp_prod - (_r)->rsp_cons)


+#ifdef __GNUC__
+#define RING_HAS_UNCONSUMED_REQUESTS(_r)
({                                       \

unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;

\

unsigned int rsp = RING_SIZE(_r) -

\

((_r)->req_cons - (_r)->rsp_prod_pvt);



\

req < rsp ? req : rsp;

\
+})
+#else
+/* Same as above, but without the nice GCC ({ ... }) syntax. */
+#define RING_HAS_UNCONSUMED_REQUESTS(_r)
\

((((_r)->sring->req_prod - (_r)->req_cons) <

\

 (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ?



\

((_r)->sring->req_prod - (_r)->req_cons) :



\

(RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt)))



+#endif



+/* Direct access to individual ring elements, by index. */
+#define RING_GET_REQUEST(_r, _idx)
\

(&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))


+/*


Get a local copy of a request.







Use this in preference to RING_GET_REQUEST() so all processing



is


done on a local copy that cannot be modified by the other end.







Note that



http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 may cause this


to be ineffective where _req is a struct which consists of only



bitfields.

*/

+#define RING_COPY_REQUEST(_r, _idx, _req) do {
\

/* Use volatile to force the copy into _req. */			
     \
*(_req) = *(volatile typeof(_req))RING_GET_REQUEST(_r, _idx);

\
+} while (0)



+#define RING_GET_RESPONSE(_r, _idx)
\

(&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))


+/* Loop termination condition: Would the specified index overflow
the ring?
*/
+#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)
\

(((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))


+/* Ill-behaved frontend determination: Can there be this many
requests? */
+#define RING_REQUEST_PROD_OVERFLOW(_r, _prod)
\

(((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r))


+#define RING_PUSH_REQUESTS(_r) do
{                                               \

xen_wmb(); /* back sees requests /before/ updated producer

index */
\

(_r)->sring->req_prod = (_r)->req_prod_pvt;

\
+} while (0)



+#define RING_PUSH_RESPONSES(_r) do
{                                              \

xen_wmb(); /* front sees resps /before/ updated producer index

*/
\

(_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;

\
+} while (0)



+/*


Notification hold-off (req_event and rsp_event):







When queueing requests or responses on a shared ring, it may



not always
be


necessary to notify the remote end. For example, if requests



are in flight


in a backend, the front may be able to queue further requests



without


notifying the back (if the back checks for new requests when it



queues


responses).







When enqueuing requests or responses:







Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The



second argument


is a boolean return value. True indicates that the receiver



requires an


asynchronous notification.







After dequeuing requests or responses (before sleeping the



connection):






Use RING_FINAL_CHECK_FOR_REQUESTS() or



RING_FINAL_CHECK_FOR_RESPONSES().


The second argument is a boolean return value. True indicates



that there


are pending messages on the ring (i.e., the connection should



not be put


to sleep).







These macros will set the req_event/rsp_event field to trigger



a


notification on the very next message that is enqueued. If you



want to


create batches of work (i.e., only receive a notification



after several


messages have been enqueued) then you will need to create a



customised


version of the FINAL_CHECK macro in your own code, which sets



the
event


field appropriately.


*/


+#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do
{                     \

RING_IDX __old = (_r)->sring->req_prod;

\

RING_IDX __new = (_r)->req_prod_pvt;

\

xen_wmb(); /* back sees requests /before/ updated producer

index */
\

(_r)->sring->req_prod = __new;

\

xen_mb(); /* back sees new requests /before/ we check req_event

*/
\

(_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <

\

		 (RING_IDX)(__new -



__old));                      \
+} while (0)



+#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do
{                    \

RING_IDX __old = (_r)->sring->rsp_prod;

\

RING_IDX __new = (_r)->rsp_prod_pvt;

\

xen_wmb(); /* front sees resps /before/ updated producer index

*/
\

(_r)->sring->rsp_prod = __new;

\

xen_mb(); /* front sees new resps /before/ we check rsp_event

*/
\

(_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <

\

		 (RING_IDX)(__new -



__old));                      \
+} while (0)



+#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do
{                       \

(_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);

\

if (_work_to_do)							
\
break;



\

(_r)->sring->req_event = (_r)->req_cons + 1;

\

xen_mb();

\

(_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);

\
+} while (0)



+#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do
{                      \

(_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);

\

if (_work_to_do)							
\
break;



\

(_r)->sring->rsp_event = (_r)->rsp_cons + 1;

\

xen_mb();

\

(_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);

\
+} while (0)



+/*


DEFINE_XEN_FLEX_RING_AND_INTF defines two monodirectional rings



and


functions to check if there is data on the ring, and to read



and


write to them.







DEFINE_XEN_FLEX_RING is similar to



DEFINE_XEN_FLEX_RING_AND_INTF, but


does not define the indexes page. As different protocols can



have


extensions to the basic format, this macro allows them to define



their


own struct.







XEN_FLEX_RING_SIZE



Convenience macro to calculate the size of one of the two



rings


from the overall order.







$NAME_mask



Function to apply the size mask to an index, to reduce the



index


within the range [0, size - 1].







$NAME_read_packet



Function to read data from the ring. The amount of data to



read is


specified by the "size" argument.







$NAME_write_packet



Function to write data to the ring. The amount of data to



write is


specified by the "size" argument.







$NAME_get_ring_ptr



Convenience function that returns a pointer to read/write to



the


ring at the right location.







$NAME_data_intf



Indexes page, shared between frontend and backend. It also



contains the array of grant refs.







$NAME_queued



Function to calculate how many bytes are currently on the



ring,


ready to be read. It can also be used to calculate how much



free


space is currently on the ring (XEN_FLEX_RING_SIZE() -



$NAME_queued()).


*/


+#ifndef XEN_PAGE_SHIFT
+/* The PAGE_SIZE for ring protocols and hypercall interfaces is
always


4K, regardless of the architecture, and page granularity chosen



by


operating systems.


*/

+#define XEN_PAGE_SHIFT 12
+#endif
+#define XEN_FLEX_RING_SIZE(order)
\

(1UL << ((order) + XEN_PAGE_SHIFT - 1))


+#define DEFINE_XEN_FLEX_RING(name)
\
+static inline RING_IDX name##_mask(RING_IDX idx, RING_IDX
ring_size)
\
+{
                     \

return idx & (ring_size - 1);

\
+}
\

							

\

+static inline unsigned char *name##_get_ring_ptr(unsigned char
*buf,
\

				 RING_IDX



idx,                    \

				 RING_IDX



ring_size)              \
+{
                     \

return buf + name##_mask(idx, ring_size);

\
+}
\

							

\

+static inline void name##_read_packet(void *opaque,
\

		      const unsigned char



*buf,                   \

		      size_t size,



\

		      RING_IDX masked_prod,



\

		      RING_IDX *masked_cons,



\

		      RING_IDX ring_size)



\
+{
                     \

if (*masked_cons < masked_prod ||

\

size <= ring_size - *masked_cons)



{                               \

memcpy(opaque, buf + *masked_cons, size);



\

} else

{
     \

memcpy(opaque, buf + *masked_cons, ring_size -



*masked_cons);
\

memcpy((unsigned char *)opaque + ring_size -



*masked_cons, buf,
\

	   size - (ring_size - *masked_cons));



\

}

\

*masked_cons = name##_mask(*masked_cons + size, ring_size);

\
+}
\

							

\

+static inline void name##_write_packet(unsigned char *buf,
\

		       const void *opaque,



\

		       size_t size,



\

		       RING_IDX *masked_prod,



\

		       RING_IDX masked_cons,



\

		       RING_IDX ring_size)



\
+{
                     \

if (*masked_prod < masked_cons ||

\

size <= ring_size - *masked_prod)



{                               \

memcpy(buf + *masked_prod, opaque, size);



\

} else

{
     \

memcpy(buf + *masked_prod, opaque, ring_size -



*masked_prod);
\

memcpy(buf, (unsigned char *)opaque + (ring_size -



*masked_prod),
\

       size - (ring_size - *masked_prod));



\

}

\

*masked_prod = name##_mask(*masked_prod + size, ring_size);

\
+}
\

							

\

+static inline RING_IDX name##_queued(RING_IDX prod,
\

		     RING_IDX cons,



\

		     RING_IDX ring_size)



\
+{
                     \

RING_IDX size;

\

							

\
if (prod == cons)

\

return 0;



\

							

\
prod = name##_mask(prod, ring_size);

\

cons = name##_mask(cons, ring_size);

\

							

\
if (prod == cons)

\

return ring_size;



\

							

\
if (prod > cons)

\

size = prod - cons;



\

else

\

size = ring_size - (cons - prod);



\

return size;

\
+}
\

							

\

+struct name##_data
{
 \

unsigned char *in; /* half of the allocation */

\

unsigned char *out; /* half of the allocation */

\
+}



+#define DEFINE_XEN_FLEX_RING_AND_INTF(name)
\
+struct name##_data_intf
{                                                         \

RING_IDX in_cons, in_prod;

\

							

\
u8 pad1[56];

\

							

\
RING_IDX out_cons, out_prod;

\

							

\
u8 pad2[56];

\

							

\
RING_IDX ring_order;

\

grant_ref_t ref[];

\
+};
\
+DEFINE_XEN_FLEX_RING(name)



+#endif /* __XEN_PUBLIC_IO_RING_H__ */



+/*


Local variables:



mode: C



c-file-style: "BSD"



c-basic-offset: 4



tab-width: 8



indent-tabs-mode: nil



End:


*/

diff --git a/include/xen/interface/io/xenbus.h
b/include/xen/interface/io/xenbus.h
new file mode 100644
index 0000000000..f452748b03
--- /dev/null
+++ b/include/xen/interface/io/xenbus.h
@@ -0,0 +1,81 @@
+/************************************************************



xenbus.h







Xenbus protocol details.







Permission is hereby granted, free of charge, to any person



obtaining a
copy


of this software and associated documentation files (the



"Software"), to


deal in the Software without restriction, including without



limitation the


rights to use, copy, modify, merge, publish, distribute,



sublicense, and/or


sell copies of the Software, and to permit persons to whom the



Software is


furnished to do so, subject to the following conditions:







The above copyright notice and this permission notice shall be



included in


all copies or substantial portions of the Software.







THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY



KIND, EXPRESS OR


IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF



MERCHANTABILITY,


FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO



EVENT SHALL THE


AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,



DAMAGES OR OTHER


LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,



ARISING


FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE



OR OTHER


DEALINGS IN THE SOFTWARE.







Copyright (C) 2005 XenSource Ltd.


*/


+#ifndef _XEN_PUBLIC_IO_XENBUS_H
+#define _XEN_PUBLIC_IO_XENBUS_H



+/*


The state of either end of the Xenbus, i.e. the current



communication


status of initialisation across the bus.  States here imply



nothing about


the state of the connection between the driver and the kernel's



device


layers.


*/

+enum xenbus_state {

XenbusStateUnknown       = 0,

XenbusStateInitialising  = 1,

/*
* InitWait: Finished early initialisation but waiting for



information

* from the peer or hotplug scripts.


*/


XenbusStateInitWait      = 2,

/*
* Initialised: Waiting for a connection from the peer.


*/


XenbusStateInitialised   = 3,

XenbusStateConnected     = 4,

/*
* Closing: The device is being closed due to an error or an



unplug event.

*/


XenbusStateClosing       = 5,

XenbusStateClosed        = 6,

/*
* Reconfiguring: The device is being reconfigured.


*/


XenbusStateReconfiguring = 7,

XenbusStateReconfigured  = 8

+};



+typedef enum xenbus_state XenbusState;



+#endif /* _XEN_PUBLIC_IO_XENBUS_H */



+/*


Local variables:



mode: C



c-file-style: "BSD"



c-basic-offset: 4



tab-width: 4



indent-tabs-mode: nil



End:


*/

diff --git a/include/xen/interface/io/xs_wire.h
b/include/xen/interface/io/xs_wire.h
new file mode 100644
index 0000000000..87987334bf
--- /dev/null
+++ b/include/xen/interface/io/xs_wire.h
@@ -0,0 +1,151 @@
+/*


Details of the "wire" protocol between Xen Store Daemon and



client


library or guest kernel.







Permission is hereby granted, free of charge, to any person



obtaining a
copy


of this software and associated documentation files (the



"Software"), to


deal in the Software without restriction, including without



limitation the


rights to use, copy, modify, merge, publish, distribute,



sublicense, and/or


sell copies of the Software, and to permit persons to whom the



Software is


furnished to do so, subject to the following conditions:







The above copyright notice and this permission notice shall be



included in


all copies or substantial portions of the Software.







THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY



KIND, EXPRESS OR


IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF



MERCHANTABILITY,


FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO



EVENT SHALL THE


AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,



DAMAGES OR OTHER


LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,



ARISING


FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE



OR OTHER


DEALINGS IN THE SOFTWARE.







Copyright (C) 2005 Rusty Russell IBM Corporation


*/


+#ifndef _XS_WIRE_H
+#define _XS_WIRE_H



+enum xsd_sockmsg_type {

/* Members take consecutive values starting at 0 unless given an initializer. */
XS_CONTROL,

/* XS_DEBUG is another name for XS_CONTROL. */
+#define XS_DEBUG XS_CONTROL

XS_DIRECTORY,
XS_READ,
XS_GET_PERMS,
XS_WATCH,
XS_UNWATCH,
XS_TRANSACTION_START,
XS_TRANSACTION_END,
XS_INTRODUCE,
XS_RELEASE,
XS_GET_DOMAIN_PATH,
XS_WRITE,
XS_MKDIR,
XS_RM,
XS_SET_PERMS,
XS_WATCH_EVENT,
XS_ERROR,
XS_IS_DOMAIN_INTRODUCED,
XS_RESUME,
XS_SET_TARGET,
/*
 * XS_RESTRICT has been removed. The "+ 2" below leaves the value
 * XS_SET_TARGET + 1 unused (presumably the number XS_RESTRICT occupied),
 * so the members after it are numbered from XS_SET_TARGET + 2 onwards.
 */
XS_RESET_WATCHES = XS_SET_TARGET + 2,
XS_DIRECTORY_PART,

/* Evaluates to XS_DIRECTORY_PART + 1, so the unused value is counted too. */
XS_TYPE_COUNT,      /* Number of valid types. */

XS_INVALID = 0xffff /* Guaranteed to remain an invalid type */

+};



+#define XS_WRITE_NONE "NONE"
+#define XS_WRITE_CREATE "CREATE"
+#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"



+/* We hand errors as strings, for portability. */
+struct xsd_errors {

/* Error number; XSD_ERROR(x) stores the constant x itself here. */
int errnum;
/* Name of that constant as text; XSD_ERROR(x) fills it in with #x. */
const char *errstring;

+};



+#ifdef EINVAL
+#define XSD_ERROR(x) { x, #x }
+/* LINTED: static unused */
+static struct xsd_errors xsd_errors[]
+#if defined(__GNUC__)
+__attribute__((unused))
+#endif

= {
XSD_ERROR(EINVAL),
XSD_ERROR(EACCES),
XSD_ERROR(EEXIST),
XSD_ERROR(EISDIR),
XSD_ERROR(ENOENT),
XSD_ERROR(ENOMEM),
XSD_ERROR(ENOSPC),
XSD_ERROR(EIO),
XSD_ERROR(ENOTEMPTY),
XSD_ERROR(ENOSYS),
XSD_ERROR(EROFS),
XSD_ERROR(EBUSY),
XSD_ERROR(EAGAIN),
XSD_ERROR(EISCONN),
XSD_ERROR(E2BIG)

+};
+#endif



+struct xsd_sockmsg {

u32 type;  /* XS_??? */
u32 req_id;/* Request identifier, echoed in daemon's

response.  */

u32 tx_id; /* Transaction id (0 if not related to a

transaction). */

u32 len;   /* Length of data following this. */

/* Generally followed by nul-terminated string(s). */

+};



+enum xs_watch_type {

XS_WATCH_PATH = 0,
XS_WATCH_TOKEN

+};



+/*


`incontents 150 xenstore_struct XenStore wire protocol.







Inter-domain shared memory communications.


*/

+#define XENSTORE_RING_SIZE 1024 /* bytes in each of req[] and rsp[] */
+typedef u32 XENSTORE_RING_IDX;
/* Reduces an index to an offset inside a ring; the AND with (size - 1) relies on the size being a power of two. */
+#define MASK_XENSTORE_IDX(idx) ((idx) & (XENSTORE_RING_SIZE - 1))
+struct xenstore_domain_interface {

char req[XENSTORE_RING_SIZE]; /* Requests to xenstore daemon. */

char rsp[XENSTORE_RING_SIZE]; /* Replies and async watch events. */

/* Presumably consumer/producer indices for req[] and rsp[] respectively -- confirm against the ring users. */
XENSTORE_RING_IDX req_cons, req_prod;
XENSTORE_RING_IDX rsp_cons, rsp_prod;
u32 server_features; /* Bitmap of features supported by the server (XENSTORE_SERVER_FEATURE_*) */

u32 connection; /* XENSTORE_CONNECTED or XENSTORE_RECONNECT */

+};



+/* Violating this is very bad.  See docs/misc/xenstore.txt. */
+#define XENSTORE_PAYLOAD_MAX 4096



+/* Violating these just gets you an error back */
+#define XENSTORE_ABS_PATH_MAX 3072
+#define XENSTORE_REL_PATH_MAX 2048



+/* The ability to reconnect a ring */
+#define XENSTORE_SERVER_FEATURE_RECONNECTION 1



+/* Valid values for the connection field */
+#define XENSTORE_CONNECTED 0 /* the steady-state */
+#define XENSTORE_RECONNECT 1 /* guest has initiated a reconnect */



+#endif /* _XS_WIRE_H */



+/*


Local variables:



mode: C



c-file-style: "BSD"



c-basic-offset: 4



tab-width: 8



indent-tabs-mode: nil



End:


*/

diff --git a/include/xen/interface/memory.h
b/include/xen/interface/memory.h
new file mode 100644
index 0000000000..19959da8b4
--- /dev/null
+++ b/include/xen/interface/memory.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/************************************************************



memory.h







Memory reservation and information.







Copyright (c) 2005, Keir Fraser keir@xensource.com


*/


+#ifndef __XEN_PUBLIC_MEMORY_H__
+#define __XEN_PUBLIC_MEMORY_H__



+/*


Increase or decrease the specified domain's memory reservation.



Returns
a


-ve errcode on failure, or the # extents successfully allocated



or freed.


arg == addr of struct xen_memory_reservation.


*/

+#define XENMEM_increase_reservation 0
+#define XENMEM_decrease_reservation 1
+#define XENMEM_populate_physmap     6
+struct xen_memory_reservation {

/*
* XENMEM_increase_reservation:


*   OUT: MFN (*not* GMFN) bases of extents that were allocated


* XENMEM_decrease_reservation:


*   IN:  GMFN bases of extents to free


* XENMEM_populate_physmap:


*   IN:  GPFN bases of extents to populate with memory


*   OUT: GMFN bases of extents that were allocated


*   (NB. This command also updates the mach_to_phys



translation
table)

*/


GUEST_HANDLE(xen_pfn_t)extent_start;

/* Number of extents, and size/alignment of each

(2^extent_order
pages). */

xen_ulong_t  nr_extents;
unsigned int   extent_order;

/*
* Maximum # bits addressable by the user of the allocated



region (e.g.,

* I/O devices often have a 32-bit limitation even in 64-bit



systems). If

* zero then the user has no addressing restriction.


* This field is not used by XENMEM_decrease_reservation.


*/


unsigned int   address_bits;

/*
* Domain whose reservation is being changed.


* Unprivileged domains can specify only DOMID_SELF.


*/


domid_t        domid;


+};



+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);



+/*


An atomic exchange of memory pages. If return code is zero then



@out.extent_list provides GMFNs of the newly-allocated memory.



Returns zero on complete success, otherwise a negative error



code.


On complete success then always @nr_exchanged ==



@in.nr_extents.


On partial success @nr_exchanged indicates how much work was



done.

*/

+#define XENMEM_exchange             11
+struct xen_memory_exchange {

/*
* [IN] Details of memory extents to be exchanged (GMFN bases).


* Note that @in.address_bits is ignored and unused.


*/


struct xen_memory_reservation in;

/*
* [IN/OUT] Details of new memory extents.


* We require that:


*  1. @in.domid == @out.domid


*  2. @in.nr_extents  << @in.extent_order ==


*     @out.nr_extents << @out.extent_order


*  3. @in.extent_start and @out.extent_start lists must not



overlap

*  4. @out.extent_start lists GPFN bases to be populated


*  5. @out.extent_start is overwritten with allocated GMFN



bases

*/


struct xen_memory_reservation out;

/*
* [OUT] Number of input extents that were successfully



exchanged:

*  1. The first @nr_exchanged input extents were successfully


*     deallocated.


*  2. The corresponding first entries in the output extent



list correctly

*     indicate the GMFNs that were successfully exchanged.


*  3. All other input and output extents are untouched.


*  4. If not all input extents are exchanged then the return



code of this

*     command will be non-zero.


*  5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!


*/


xen_ulong_t nr_exchanged;

+};



+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange);
+/*


Returns the maximum machine frame number of mapped RAM in this



system.


This command always succeeds (it never returns an error code).



arg == NULL.


*/

+#define XENMEM_maximum_ram_page     2



+/*


Returns the current or maximum memory reservation, in pages, of



the


specified domain (may be DOMID_SELF). Returns -ve errcode on



failure.


arg == addr of domid_t.


*/

+#define XENMEM_current_reservation  3
+#define XENMEM_maximum_reservation  4



+/*


Returns a list of MFN bases of 2MB extents comprising the



machine_to_phys


mapping table. Architectures which do not have a m2p table do



not
implement


this command.



arg == addr of xen_machphys_mfn_list_t.


*/

+#define XENMEM_machphys_mfn_list    5
+struct xen_machphys_mfn_list {

/*
* Size of the 'extent_start' array. Fewer entries will be



filled if the

* machphys table is smaller than max_extents * 2MB.


*/


unsigned int max_extents;

/*
* Pointer to buffer to fill with list of extent starts. If



there are

* any large discontiguities in the machine address space, 2MB



gaps in

* the machphys table will be represented by an MFN base of



zero.

*/


GUEST_HANDLE(xen_pfn_t)extent_start;

/*
* Number of extents written to the above array. This will be



smaller

* than 'max_extents' if the machphys table is smaller than



max_e *
2MB.

*/


unsigned int nr_extents;

+};



+DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);



+/*


Returns the location in virtual address space of the



machine_to_phys


mapping table. Architectures which do not have a m2p table, or



which do
not


map it by default into guest address space, do not implement



this
command.


arg == addr of xen_machphys_mapping_t.


*/

+#define XENMEM_machphys_mapping     12
+struct xen_machphys_mapping {

xen_ulong_t v_start, v_end; /* Start and end virtual addresses.   */

xen_ulong_t max_mfn;        /* Maximum MFN that can be looked up. */
+};



/*
 * NOTE(review): every other guest handle in this header is declared with
 * the struct tag itself (e.g. xen_machphys_mfn_list). This one passes
 * 'xen_machphys_mapping_t', and no struct tag or typedef of that name is
 * visible here -- confirm the '_t' suffix is intended.
 */
+DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping_t);



+#define XENMAPSPACE_shared_info  0 /* shared info page */
+#define XENMAPSPACE_grant_table  1 /* grant table page */
+#define XENMAPSPACE_gmfn         2 /* GMFN */
+#define XENMAPSPACE_gmfn_range   3 /* GMFN range,
XENMEM_add_to_physmap only. */
+#define XENMAPSPACE_gmfn_foreign 4 /* GMFN from another dom,

		    * XENMEM_add_to_physmap_range only.


		    */



+#define XENMAPSPACE_dev_mmio     5 /* device mmio region */



+/*


Sets the GPFN at which a particular page appears in the



specified guest's


pseudophysical address space.



arg == addr of xen_add_to_physmap_t.


*/

+#define XENMEM_add_to_physmap      7
+struct xen_add_to_physmap {

/* Which domain to change the mapping for. */
domid_t domid;

/* Number of pages to go through for gmfn_range (XENMAPSPACE_gmfn_range). */
u16    size;

/* Source mapping space (one of the XENMAPSPACE_* values). */
unsigned int space;

/* Index into source mapping space. */
xen_ulong_t idx;

/* GPFN where the source mapping page should appear. */
xen_pfn_t gpfn;

+};



+DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);



+/*** REMOVED ***/
+/*#define XENMEM_translate_gpfn_list  8*/



+#define XENMEM_add_to_physmap_range 23
+struct xen_add_to_physmap_range {

/* IN */
/* Which domain to change the mapping for. */
domid_t domid;
/*
 * NOTE(review): no 'enum phys_map_space' is defined anywhere visible in
 * this header; the XENMAPSPACE_* defines appear to be the intended
 * values -- confirm.
 */
u16 space; /* => enum phys_map_space */

/* Number of pages to go through */
u16 size;
domid_t foreign_domid; /* IFF gmfn_foreign (i.e. space is XENMAPSPACE_gmfn_foreign) */

/* Indexes into space being mapped. */
GUEST_HANDLE(xen_ulong_t)idxs;

/* GPFN in domid where the source mapping page should appear. */

GUEST_HANDLE(xen_pfn_t)gpfns;

/* OUT */

/* Per index error code. */
GUEST_HANDLE(int)errs;

+};



+DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);



+/*


Returns the pseudo-physical memory map as it was when the



domain


was started (specified by XENMEM_set_memory_map).



arg == addr of struct xen_memory_map.


*/

+#define XENMEM_memory_map           9
+struct xen_memory_map {

/*
 * Argument of XENMEM_memory_map; XENMEM_machine_memory_map passes the
 * same structure.
 */

/*
 * On call the number of entries which can be stored in buffer. On
 * return the number of entries which have been stored in
 * buffer.
 */
unsigned int nr_entries;

/*
 * Entries in the buffer are in the same format as returned by the
 * BIOS INT 0x15 EAX=0xE820 call.
 */
GUEST_HANDLE(void)buffer;

+};



+DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);



+/*


Returns the real physical memory map. Passes the same structure



as


XENMEM_memory_map.



arg == addr of struct xen_memory_map.


*/

+#define XENMEM_machine_memory_map   10



+/*


Unmaps the page appearing at a particular GPFN from the



specified
guest's


pseudophysical address space.



arg == addr of xen_remove_from_physmap_t.


*/

+#define XENMEM_remove_from_physmap      15
+struct xen_remove_from_physmap {

/* Which domain to change the mapping for. */
domid_t domid;

/* GPFN of the current mapping of the page. */
xen_pfn_t gpfn;

+};



+DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);



+/*


Get the pages for a particular guest resource, so that they can



be


mapped directly by a tools domain.


*/

+#define XENMEM_acquire_resource 28
+struct xen_mem_acquire_resource {

/* IN - The domain whose resource is to be mapped */
domid_t domid;
/* IN - the type of resource */
u16 type;


+#define XENMEM_resource_ioreq_server 0
+#define XENMEM_resource_grant_table 1


/*
* IN - a type-specific resource identifier, which must be zero


*      unless stated otherwise.


*


* type == XENMEM_resource_ioreq_server -> id == ioreq server



id

* type == XENMEM_resource_grant_table -> id defined below


*/


u32 id;


+#define XENMEM_resource_grant_table_id_shared 0
+#define XENMEM_resource_grant_table_id_status 1


/* IN/OUT - As an IN parameter number of frames of the resource
*          to be mapped. However, if the specified value is 0



and

*          frame_list is NULL then this field will be set to



the

*          maximum value supported by the implementation on



return.

*/


u32 nr_frames;
/*
* OUT - Must be zero on entry. On return this may contain a



bitwise

*       OR of the following values.


*/


u32 flags;

/* The resource pages have been assigned to the calling domain

*/
+#define _XENMEM_rsrc_acq_caller_owned 0
+#define XENMEM_rsrc_acq_caller_owned (1u <<
_XENMEM_rsrc_acq_caller_owned)


/*
* IN - the index of the initial frame to be mapped. This



parameter

*      is ignored if nr_frames is 0.


*/


u64 frame;


+#define XENMEM_resource_ioreq_server_frame_bufioreq 0
+#define XENMEM_resource_ioreq_server_frame_ioreq(n) (1 + (n))


/*
* IN/OUT - If the tools domain is PV then, upon return,



frame_list

*          will be populated with the MFNs of the resource.


*          If the tools domain is HVM then it is expected



that, on

*          entry, frame_list will be populated with a list of



GFNs

*          that will be mapped to the MFNs of the resource.


*          If -EIO is returned then the frame_list has only



been

*          partially mapped and it is up to the caller to



unmap all

*          the GFNs.


*          This parameter may be NULL if nr_frames is 0.


*/


GUEST_HANDLE(xen_pfn_t)frame_list;

+};



+DEFINE_GUEST_HANDLE_STRUCT(xen_mem_acquire_resource);



+#endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --git a/include/xen/interface/sched.h
b/include/xen/interface/sched.h
new file mode 100644
index 0000000000..0f12dcf267
--- /dev/null
+++ b/include/xen/interface/sched.h
@@ -0,0 +1,188 @@
+/************************************************************



sched.h







Scheduler state interactions







Permission is hereby granted, free of charge, to any person



obtaining a
copy


of this software and associated documentation files (the



"Software"), to


deal in the Software without restriction, including without



limitation the


rights to use, copy, modify, merge, publish, distribute,



sublicense, and/or


sell copies of the Software, and to permit persons to whom the



Software is


furnished to do so, subject to the following conditions:







The above copyright notice and this permission notice shall be



included in


all copies or substantial portions of the Software.







THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY



KIND, EXPRESS OR


IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF



MERCHANTABILITY,


FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO



EVENT SHALL THE


AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,



DAMAGES OR OTHER


LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,



ARISING


FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE



OR OTHER


DEALINGS IN THE SOFTWARE.







Copyright (c) 2005, Keir Fraser keir@xensource.com


*/


+#ifndef __XEN_PUBLIC_SCHED_H__
+#define __XEN_PUBLIC_SCHED_H__



+#include <xen/interface/event_channel.h>



+/*


Guest Scheduler Operations







The SCHEDOP interface provides mechanisms for a guest to



interact


with the scheduler, including yield, blocking and shutting



itself


down.


*/


+/*


The prototype for this hypercall is:



long HYPERVISOR_sched_op(enum sched_op cmd, void *arg, ...)







@cmd == SCHEDOP_??? (scheduler operation).



@arg == Operation-specific extra argument(s), as described



below.


...  == Additional Operation-specific extra arguments,



described below.






Versions of Xen prior to 3.0.2 provided only the following



legacy version


of this hypercall, supporting only the commands yield, block



and
shutdown:


long sched_op(int cmd, unsigned long arg)



@cmd == SCHEDOP_??? (scheduler operation).



@arg == 0               (SCHEDOP_yield and SCHEDOP_block)



 == SHUTDOWN_* code (SCHEDOP_shutdown)









This legacy version is available to new guests as:



long HYPERVISOR_sched_op_compat(enum sched_op cmd, unsigned



long
arg)

*/


+/*


Voluntarily yield the CPU.



@arg == NULL.


*/

+#define SCHEDOP_yield       0



+/*


Block execution of this VCPU until an event is received for



processing.


If called with event upcalls masked, this operation will



atomically


reenable event delivery and check for pending events before



blocking the


VCPU. This avoids a "wakeup waiting" race.



@arg == NULL.


*/

+#define SCHEDOP_block       1



+/*


Halt execution of this domain (all VCPUs) and notify the system



controller.


@arg == pointer to sched_shutdown structure.







If the sched_shutdown_t reason is SHUTDOWN_suspend then



x86 PV guests must also set RDX (EDX for 32-bit guests) to the



MFN


of the guest's start info page.  RDX/EDX is the third hypercall



argument.







In addition, when reason is SHUTDOWN_suspend this hypercall



returns 1 if suspend was cancelled or the domain was merely



checkpointed, and 0 if it is resuming in a new domain.


*/

+#define SCHEDOP_shutdown    2



+/*


Poll a set of event-channel ports. Return when one or more are



pending.
An


optional timeout may be specified.



@arg == pointer to sched_poll structure.


*/

+#define SCHEDOP_poll        3



+/*


Declare a shutdown for another domain. The main use of this



function is


in interpreting shutdown requests and reasons for fully-



virtualized


domains.  A para-virtualized domain may use SCHEDOP_shutdown



directly.


@arg == pointer to sched_remote_shutdown structure.


*/

+#define SCHEDOP_remote_shutdown        4



+/*


Latch a shutdown code, so that when the domain later shuts down



it


reports this code to the control tools.



@arg == sched_shutdown, as for SCHEDOP_shutdown.


*/

+#define SCHEDOP_shutdown_code 5



+/*


Setup, poke and destroy a domain watchdog timer.



@arg == pointer to sched_watchdog structure.



With id == 0, setup a domain watchdog timer to cause domain



shutdown


          after timeout, returns watchdog id.





With id != 0 and timeout == 0, destroy domain watchdog timer.



With id != 0 and timeout != 0, poke watchdog timer and set new



timeout.

*/

+#define SCHEDOP_watchdog    6



+/*


Override the current vcpu affinity by pinning it to one



physical cpu or


undo this override restoring the previous affinity.



@arg == pointer to sched_pin_override structure.







A negative pcpu value will undo a previous pin override and



restore the


previous cpu affinity.



This call is allowed for the hardware domain only and requires



the cpu


to be part of the domain's cpupool.


*/

+#define SCHEDOP_pin_override 7



+struct sched_shutdown {

/* Argument of SCHEDOP_shutdown and SCHEDOP_shutdown_code. */
unsigned int reason; /* SHUTDOWN_* => shutdown reason */

+};



+DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);



+struct sched_poll {

/* Argument of SCHEDOP_poll. */
GUEST_HANDLE(evtchn_port_t)ports; /* Event-channel ports to poll. */
unsigned int nr_ports;            /* Presumably the number of entries in 'ports' -- confirm. */
u64 timeout;                      /* Optional timeout; units are not stated in this header. */

+};



+DEFINE_GUEST_HANDLE_STRUCT(sched_poll);



+struct sched_remote_shutdown {

/* Argument of SCHEDOP_remote_shutdown (declare a shutdown for another domain). */
domid_t domain_id;         /* Remote domain ID */
unsigned int reason;       /* SHUTDOWN_* => shutdown reason */

+};



+DEFINE_GUEST_HANDLE_STRUCT(sched_remote_shutdown);



+struct sched_watchdog {

/*
 * Argument of SCHEDOP_watchdog:
 *   id == 0                 -> set up a new watchdog (its id is returned)
 *   id != 0, timeout == 0   -> destroy that watchdog
 *   id != 0, timeout != 0   -> poke that watchdog and set a new timeout
 */
u32 id;                /* watchdog ID */
u32 timeout;           /* timeout */

+};



+DEFINE_GUEST_HANDLE_STRUCT(sched_watchdog);



+struct sched_pin_override {

/*
 * Argument of SCHEDOP_pin_override: physical cpu to pin the current vcpu
 * to. A negative value undoes a previous override and restores the
 * previous affinity.
 */
s32 pcpu;

+};



+DEFINE_GUEST_HANDLE_STRUCT(sched_pin_override);



+/*


Reason codes for SCHEDOP_shutdown. These may be interpreted by



control


software to determine the appropriate action. For the most



part, Xen does


not care about the shutdown code.


*/

+#define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up
and kill. */
+#define SHUTDOWN_reboot     1  /* Clean up, kill, and then
restart.
*/
+#define SHUTDOWN_suspend    2  /* Clean up, save suspend info,
kill.
*/
+#define SHUTDOWN_crash      3  /* Tell controller we've crashed.
*/
+#define SHUTDOWN_watchdog   4  /* Restart because watchdog time
expired.     */



+/*


Domain asked to perform 'soft reset' for it. The expected



behavior is to


reset internal Xen state for the domain returning it to the



point where it


was created but leaving the domain's memory contents and vCPU



contexts


intact. This will allow the domain to start over and set up all



Xen specific


interfaces again.


*/

+#define SHUTDOWN_soft_reset 5
+#define SHUTDOWN_MAX        5  /* Maximum valid shutdown reason.
*/



+#endif /* __XEN_PUBLIC_SCHED_H__ */
diff --git a/include/xen/interface/xen.h
b/include/xen/interface/xen.h
new file mode 100644
index 0000000000..964daaedfb
--- /dev/null
+++ b/include/xen/interface/xen.h
@@ -0,0 +1,225 @@
+/************************************************************



xen.h







Guest OS interface to Xen.







Permission is hereby granted, free of charge, to any person



obtaining a
copy


of this software and associated documentation files (the



"Software"), to


deal in the Software without restriction, including without



limitation the


rights to use, copy, modify, merge, publish, distribute,



sublicense, and/or


sell copies of the Software, and to permit persons to whom the



Software is


furnished to do so, subject to the following conditions:







The above copyright notice and this permission notice shall be



included in


all copies or substantial portions of the Software.







THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY



KIND, EXPRESS OR


IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF



MERCHANTABILITY,


FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO



EVENT SHALL THE


AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,



DAMAGES OR OTHER


LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,



ARISING


FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE



OR OTHER


DEALINGS IN THE SOFTWARE.







Copyright (c) 2004, K A Fraser


*/


+#ifndef __XEN_PUBLIC_XEN_H__
+#define __XEN_PUBLIC_XEN_H__



+#include <xen/arm/interface.h>



+/*


XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).


*/


+/*


x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3,



4, 5.


    EAX = return value





    (argument registers may be clobbered on return)





x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2,



3, 4, 5, 6.


    RAX = return value





    (argument registers not clobbered on return; RCX, R11





are)

*/

+#define __HYPERVISOR_set_trap_table        0
+#define __HYPERVISOR_mmu_update            1
+#define __HYPERVISOR_set_gdt               2
+#define __HYPERVISOR_stack_switch          3
+#define __HYPERVISOR_set_callbacks         4
+#define __HYPERVISOR_fpu_taskswitch        5
+#define __HYPERVISOR_sched_op_compat       6
+#define __HYPERVISOR_platform_op           7
+#define __HYPERVISOR_set_debugreg          8
+#define __HYPERVISOR_get_debugreg          9
+#define __HYPERVISOR_update_descriptor    10
+#define __HYPERVISOR_memory_op            12
+#define __HYPERVISOR_multicall            13
+#define __HYPERVISOR_update_va_mapping    14
+#define __HYPERVISOR_set_timer_op         15
+#define __HYPERVISOR_event_channel_op_compat 16
+#define __HYPERVISOR_xen_version          17
+#define __HYPERVISOR_console_io           18
+#define __HYPERVISOR_physdev_op_compat    19
+#define __HYPERVISOR_grant_table_op       20
+#define __HYPERVISOR_vm_assist            21
+#define __HYPERVISOR_update_va_mapping_otherdomain 22
+#define __HYPERVISOR_iret                 23 /* x86 only */
+#define __HYPERVISOR_vcpu_op              24
+#define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
+#define __HYPERVISOR_mmuext_op            26
+#define __HYPERVISOR_xsm_op               27
+#define __HYPERVISOR_nmi_op               28
+#define __HYPERVISOR_sched_op             29
+#define __HYPERVISOR_callback_op          30
+#define __HYPERVISOR_xenoprof_op          31
+#define __HYPERVISOR_event_channel_op     32
+#define __HYPERVISOR_physdev_op           33
+#define __HYPERVISOR_hvm_op               34
+#define __HYPERVISOR_sysctl               35
+#define __HYPERVISOR_domctl               36
+#define __HYPERVISOR_kexec_op             37
+#define __HYPERVISOR_tmem_op              38
+#define __HYPERVISOR_xc_reserved_op       39 /* reserved for
XenClient */
+#define __HYPERVISOR_xenpmu_op            40
+#define __HYPERVISOR_dm_op                41



+/* Architecture-specific hypercall definitions. */
+#define __HYPERVISOR_arch_0               48
+#define __HYPERVISOR_arch_1               49
+#define __HYPERVISOR_arch_2               50
+#define __HYPERVISOR_arch_3               51
+#define __HYPERVISOR_arch_4               52
+#define __HYPERVISOR_arch_5               53
+#define __HYPERVISOR_arch_6               54
+#define __HYPERVISOR_arch_7               55



+#ifndef __ASSEMBLY__



+typedef u16 domid_t;



+/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary
domains. */
+#define DOMID_FIRST_RESERVED (0x7FF0U)



+/* DOMID_SELF is used in certain contexts to refer to oneself. */
+#define DOMID_SELF (0x7FF0U)



+/*


DOMID_IO is used to restrict page-table updates to mapping I/O



memory.


Although no Foreign Domain need be specified to map I/O pages,



DOMID_IO


is useful to ensure that no mappings to the OS's own heap are



accidentally


installed. (e.g., in Linux this could cause havoc as reference



counts


aren't adjusted on the I/O-mapping code path).



This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that



context can


be specified by any calling domain.


*/

+#define DOMID_IO   (0x7FF1U)



+/*


DOMID_XEN is used to allow privileged domains to map restricted



parts of


Xen's heap space (e.g., the machine_to_phys table).



This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only



permitted if


the caller is privileged.


*/

+#define DOMID_XEN  (0x7FF2U)



+/* DOMID_COW is used as the owner of sharable pages */
+#define DOMID_COW  (0x7FF3U)



+/* DOMID_INVALID is used to identify pages with unknown owner. */
+#define DOMID_INVALID (0x7FF4U)



+/* Idle domain. */
+#define DOMID_IDLE (0x7FFFU)



+struct vcpu_info {

/*
* 'evtchn_upcall_pending' is written non-zero by Xen to



indicate

* a pending notification for a particular VCPU. It is then



cleared

* by the guest OS /before/ checking for pending work, thus



avoiding

* a set-and-check race. Note that the mask is only accessed by



Xen

* on the CPU that is currently hosting the VCPU. This means



that the

* pending and mask flags can be updated by the guest without



special

* synchronisation (i.e., no need for the x86 LOCK prefix).


* This may seem suboptimal because if the pending flag is set



by

* a different CPU then an IPI may be scheduled even when the



mask

* is set. However, note:


*  1. The task of 'interrupt holdoff' is covered by the per-



event-

*     channel mask bits. A 'noisy' event that is continually



being

*     triggered can be masked at source at this very precise


*     granularity.


*  2. The main purpose of the per-VCPU mask is therefore to



restrict

*     reentrant execution: whether for concurrency control, or



to

*     prevent unbounded stack usage. Whatever the purpose, we



expect

*     that the mask will be asserted only for short periods at



a time,

*     and so the likelihood of a 'spurious' IPI is suitably



small.

* The mask is read before making an event upcall to the guest:



a

* non-zero mask therefore guarantees that the VCPU will not



receive

* an upcall activation. The mask is cleared when the VCPU



requests

* to block: this avoids wakeup-waiting races.


*/


u8 evtchn_upcall_pending;
u8 evtchn_upcall_mask;
xen_ulong_t evtchn_pending_sel;
struct arch_vcpu_info arch;
struct pvclock_vcpu_time_info time;

+}; /* 64 bytes (x86) */



+/*


Xen/kernel shared data -- pointer provided in start_info.



NB. We expect that this struct is smaller than a page.


*/

+struct shared_info {

struct vcpu_info vcpu_info[MAX_VIRT_CPUS];

/*
* A domain can create "event channels" on which it can send



and
receive

* asynchronous event notifications. There are three classes of



event
that

* are delivered by this mechanism:


*  1. Bi-directional inter- and intra-domain connections.



Domains must

*     arrange out-of-band to set up a connection (usually by



allocating

*     an unbound 'listener' port and advertising that via a



storage
service

*     such as xenstore).


*  2. Physical interrupts. A domain with suitable hardware-



access

*     privileges can bind an event-channel port to a physical



interrupt

*     source.


*  3. Virtual interrupts ('events'). A domain can bind an



event-channel

*     port to a virtual interrupt source, such as the virtual-



timer

*     device or the emergency console.


*


* Event channels are addressed by a "port index". Each channel



is

* associated with two bits of information:


*  1. PENDING -- notifies the domain that there is a pending



notification

*     to be processed. This bit is cleared by the guest.


*  2. MASK -- if this bit is clear then a 0->1 transition of



PENDING

*     will cause an asynchronous upcall to be scheduled. This



bit is
only

*     updated by the guest. It is read-only within Xen. If a



channel

*     becomes pending while the channel is masked then the



'edge' is
lost

*     (i.e., when the channel is unmasked, the guest must



manually
handle

*     pending notifications as no upcall will be scheduled by



Xen).

*


* To expedite scanning of pending notifications, any 0->1



pending

* transition on an unmasked channel causes a corresponding bit



in a

* per-vcpu selector word to be set. Each bit in the selector



covers a

* 'C long' in the PENDING bitfield array.


*/


xen_ulong_t evtchn_pending[sizeof(xen_ulong_t) * 8];
xen_ulong_t evtchn_mask[sizeof(xen_ulong_t) * 8];

/*
* Wallclock time: updated only by control software. Guests



should base

* their gettimeofday() syscall on this wallclock-base value.


*/


struct pvclock_wall_clock wc;

struct arch_shared_info arch;


+};



+#else /* __ASSEMBLY__ */



+/* In assembly code we cannot use C numeric constant suffixes. */
+#define mk_unsigned_long(x) x



+#endif /* !__ASSEMBLY__ */



+#endif /* __XEN_PUBLIC_XEN_H__ */
2.17.1